From 70cf5ccc4a93f7824fc62e715ef8fa8c063f1491 Mon Sep 17 00:00:00 2001 From: "haowen.han" Date: Mon, 20 May 2024 11:48:06 +0000 Subject: [PATCH 1/6] Revert "paddle_musa v2.6.0 release initialization (#64265)" This reverts commit 6caf5d5cfdae84480dbd31673045355c01b7b3da. --- .gitmodules | 8 + CMakeLists.txt | 71 +- README.md | 2 +- README_cn.md | 4 +- README_ja.md | 2 +- cmake/configure.cmake | 13 - cmake/cupti.cmake | 6 +- cmake/external/cryptopp.cmake | 18 +- cmake/external/eigen.cmake | 70 - cmake/flags.cmake | 5 - cmake/generic.cmake | 143 +- cmake/inference_lib.cmake | 15 +- cmake/mccl.cmake | 51 - cmake/mudnn.cmake | 92 - cmake/musa.cmake | 128 -- cmake/operators.cmake | 94 +- cmake/phi.cmake | 2 +- paddle/cinn/ir/ir_base.h | 9 +- paddle/cinn/ir/utils/ir_nodes_collector.cc | 67 +- paddle/common/array.h | 8 +- paddle/common/hostdevice.h | 6 +- paddle/common/macros.h | 2 +- .../distributed/collective/CMakeLists.txt | 4 +- .../collective/process_group_nccl.cc | 30 +- .../collective/process_group_nccl.h | 2 +- .../collective/processgroup_comm_utils.cc | 6 +- .../fluid/distributed/collective/reducer.cc | 6 +- .../distributed/common/chunk_allocator.h | 14 +- .../distributed/fleet_executor/carrier.cc | 2 +- .../fleet_executor/cond_interceptor.cc | 2 +- .../distributed/fleet_executor/dist_model.cc | 2 +- .../distributed/fleet_executor/message_bus.cc | 2 +- .../forwards/multiply_fwd_func.cc | 10 +- .../eager/auto_code_generator/CMakeLists.txt | 4 - .../generator/eager_gen.py | 2 +- .../generator/python_c_gen.py | 2 +- paddle/fluid/eager/nan_inf_utils.cc | 2 +- paddle/fluid/framework/CMakeLists.txt | 7 +- paddle/fluid/framework/conv_search_cache.h | 18 - paddle/fluid/framework/custom_operator.cc | 4 +- paddle/fluid/framework/data_feed.cc | 4 +- paddle/fluid/framework/data_feed.cu | 40 +- paddle/fluid/framework/data_feed.h | 2 +- paddle/fluid/framework/data_feed_factory.cc | 2 +- paddle/fluid/framework/data_type_transform.cc | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 71 +- .../framework/details/all_reduce_op_handle.cc | 12 +- .../framework/details/all_reduce_op_handle.h | 8 +- .../framework/details/broadcast_op_handle.cc | 6 +- .../framework/details/broadcast_op_handle.h | 8 +- .../fluid/framework/details/build_strategy.cc | 12 +- .../fluid/framework/details/build_strategy.h | 2 +- .../details/eager_deletion_op_handle.cc | 21 +- .../details/eager_deletion_op_handle.h | 2 +- .../details/fetch_async_op_handle.cc | 2 +- .../framework/details/fetch_op_handle.cc | 2 +- .../details/fused_all_reduce_op_handle.cc | 21 +- .../details/fused_all_reduce_op_handle.h | 6 +- .../details/fused_broadcast_op_handle.h | 4 +- .../grad_merge_all_reduce_op_handle.cc | 6 +- .../details/grad_merge_all_reduce_op_handle.h | 6 +- .../framework/details/nan_inf_utils_detail.cc | 2 +- .../fluid/framework/details/nccl_op_handle.h | 61 +- .../fluid/framework/details/op_handle_base.cc | 34 +- .../fluid/framework/details/op_handle_base.h | 2 +- .../framework/details/reduce_op_handle.cc | 8 +- .../framework/details/reduce_op_handle.h | 6 +- .../details/scale_loss_grad_op_handle.cc | 4 +- .../details/share_tensor_buffer_op_handle.cc | 2 +- .../details/sparse_all_reduce_op_handle.cc | 6 +- paddle/fluid/framework/details/var_handle.h | 4 +- paddle/fluid/framework/device_worker.h | 20 +- .../fluid/framework/device_worker_factory.cc | 4 +- paddle/fluid/framework/dlpack_tensor.cc | 4 +- paddle/fluid/framework/fleet/CMakeLists.txt | 14 +- paddle/fluid/framework/fleet/box_wrapper.cu | 22 - 
paddle/fluid/framework/fleet/box_wrapper.h | 3 - .../fluid/framework/fleet/box_wrapper_impl.h | 13 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 5 +- paddle/fluid/framework/fleet/fleet_wrapper.h | 2 +- .../framework/fleet/heter_ps/CMakeLists.txt | 15 - .../fleet/heter_ps/graph_gpu_wrapper.cu | 6 +- .../fleet/heter_ps/graph_gpu_wrapper.h | 8 +- .../framework/fleet/heter_ps/heter_comm.h | 8 +- .../framework/fleet/heter_ps/heter_comm_inl.h | 16 +- .../framework/fleet/heter_ps/heter_ps.cu | 4 +- .../fluid/framework/fleet/heter_ps/heter_ps.h | 4 +- .../framework/fleet/heter_ps/heter_ps_base.h | 4 +- paddle/fluid/framework/fleet/heter_wrapper.cc | 6 +- paddle/fluid/framework/fleet/heter_wrapper.h | 2 +- paddle/fluid/framework/fleet/nccl_wrapper.cc | 22 +- paddle/fluid/framework/fleet/nccl_wrapper.h | 10 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 12 +- paddle/fluid/framework/garbage_collector.cc | 8 +- paddle/fluid/framework/garbage_collector.h | 2 +- paddle/fluid/framework/hogwild_worker.cc | 18 +- paddle/fluid/framework/ir/CMakeLists.txt | 8 +- paddle/fluid/framework/ir/cost_model.cc | 4 +- paddle/fluid/framework/ir/fuse_bn_act_pass.cc | 4 +- .../framework/ir/fuse_bn_add_act_pass.cc | 4 +- .../framework/ir/fusion_group/CMakeLists.txt | 2 +- .../ir/fusion_group/code_generator_tester.cc | 2 +- .../ir/fusion_group/cuda_resources.h | 2 +- paddle/fluid/framework/ir/graph_helper.cc | 6 +- ...est_reference_count_pass_last_lived_ops.cc | 2 +- .../all_reduce_deps_pass.cc | 2 +- .../fuse_all_reduce_op_pass.cc | 16 +- .../multi_devices_graph_pass.cc | 18 +- .../multi_devices_graph_pass.h | 4 +- .../instruction/instruction_util.cc | 6 +- .../interpreter/execution_config.cc | 2 +- .../interpreter/interpreter_util.cc | 2 +- .../interpreter/stream_analyzer.cc | 4 +- .../new_executor/interpreter_base_impl.h | 4 +- .../new_executor/new_executor_defs.cc | 4 +- .../new_executor/new_executor_defs.h | 4 +- .../framework/new_executor/pir_interpreter.cc | 10 +- .../fluid/framework/new_executor/profiler.h | 2 +- .../new_executor/program_interpreter.cc | 22 +- .../new_executor/program_interpreter.h | 4 +- paddle/fluid/framework/op_registry.h | 4 +- paddle/fluid/framework/operator.cc | 18 +- paddle/fluid/framework/operator.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 60 +- paddle/fluid/framework/parallel_executor.h | 2 +- paddle/fluid/framework/phi_utils.cc | 2 +- paddle/fluid/framework/phi_utils.h | 2 +- paddle/fluid/framework/pipeline_trainer.cc | 4 +- paddle/fluid/framework/ps_gpu_trainer.cc | 2 +- paddle/fluid/framework/ps_gpu_worker.cc | 6 +- paddle/fluid/framework/pull_dense_worker.cc | 14 +- paddle/fluid/framework/section_worker.cc | 4 +- paddle/fluid/framework/tensor_util.cc | 14 +- paddle/fluid/framework/tensor_util.h | 8 +- paddle/fluid/framework/trainer.h | 12 +- paddle/fluid/framework/trainer_factory.cc | 6 +- paddle/fluid/framework/var_type_traits.cc | 7 - paddle/fluid/framework/var_type_traits.h | 20 +- paddle/fluid/imperative/CMakeLists.txt | 11 +- paddle/fluid/imperative/all_reduce.cc | 37 +- paddle/fluid/imperative/all_reduce.h | 2 +- paddle/fluid/imperative/amp_auto_cast.cc | 2 +- paddle/fluid/imperative/gloo_context.cc | 2 +- .../fluid/imperative/gradient_accumulator.cc | 18 +- paddle/fluid/imperative/nccl_context.cc | 22 +- paddle/fluid/imperative/nccl_context.h | 10 +- paddle/fluid/imperative/prepared_operator.cc | 6 +- paddle/fluid/imperative/reducer.cc | 10 +- paddle/fluid/imperative/reducer.cu | 2 +- paddle/fluid/imperative/reducer.h | 2 +- paddle/fluid/imperative/tracer.cc 
| 6 +- paddle/fluid/inference/CMakeLists.txt | 2 +- .../ir_params_sync_among_devices_pass.cc | 4 +- .../ir_params_sync_among_devices_pass.h | 2 +- paddle/fluid/inference/api/analysis_config.cc | 15 +- .../fluid/inference/api/analysis_predictor.cc | 35 +- .../fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- .../inference/api/details/zero_copy_tensor.cc | 21 +- paddle/fluid/inference/api/infer_context.cc | 2 +- paddle/fluid/inference/api/infer_context.h | 4 +- .../inference/api/paddle_analysis_config.h | 3 +- paddle/fluid/inference/api/paddle_api.h | 3 - .../inference/api/paddle_pass_builder.cc | 5 +- .../fluid/inference/api/resource_manager.cc | 79 +- paddle/fluid/inference/api/resource_manager.h | 26 +- paddle/fluid/inference/lite/tensor_utils.cc | 2 +- paddle/fluid/inference/tensorrt/op_teller.cc | 8 +- .../tensorrt/plugin/c_allreduce_op_plugin.cu | 28 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 3 - paddle/fluid/inference/utils/CMakeLists.txt | 13 - paddle/fluid/inference/utils/benchmark.cc | 54 - paddle/fluid/inference/utils/benchmark.h | 56 - .../fluid/inference/utils/benchmark_tester.cc | 40 - .../inference/utils/table_printer_tester.cc | 82 - paddle/fluid/memory/CMakeLists.txt | 11 - paddle/fluid/memory/allocation/CMakeLists.txt | 14 +- paddle/fluid/memory/allocation/allocator.h | 14 +- .../memory/allocation/allocator_facade.cc | 32 +- .../memory/allocation/allocator_facade.h | 2 +- .../memory/allocation/buddy_allocator.cc | 6 +- .../fluid/memory/allocation/cuda_allocator.cc | 4 - .../cuda_device_context_allocator.h | 9 +- .../allocation/cuda_managed_allocator.cc | 5 - .../allocation/naive_best_fit_allocator.cc | 28 +- .../memory/allocation/pinned_allocator.cc | 4 - .../allocation/stream_safe_cuda_allocator.cc | 16 - .../allocation/stream_safe_cuda_allocator.h | 3 - .../memory/allocation/system_allocator.cc | 22 +- .../memory/allocation/system_allocator.h | 2 +- paddle/fluid/memory/malloc.cc | 2 +- paddle/fluid/memory/malloc.h | 2 +- paddle/fluid/memory/memcpy.cc | 83 +- paddle/fluid/operators/CMakeLists.txt | 12 +- paddle/fluid/operators/affine_channel_op.cu | 2 +- .../fluid/operators/array_to_lod_tensor_op.cc | 2 +- paddle/fluid/operators/batch_norm_op.cu | 2 +- .../fluid/operators/class_center_sample_op.cu | 23 +- .../fluid/operators/collective/CMakeLists.txt | 2 +- .../operators/collective/alltoall_op.cu.cc | 10 +- .../operators/collective/barrier_op.cu.cc | 12 +- .../operators/collective/c_allgather_op.cu.cc | 10 +- .../collective/c_allreduce_max_op.cu.cc | 4 +- .../operators/collective/c_allreduce_op.h | 20 +- .../collective/c_allreduce_sum_op.cu.cc | 4 +- .../operators/collective/c_broadcast_op.cu.cc | 14 +- .../collective/c_comm_init_all_op.cc | 4 +- .../collective/c_comm_init_multitrainer_op.cc | 10 +- .../operators/collective/c_comm_init_op.cc | 17 +- .../operators/collective/c_concat_op.cu.cc | 14 +- .../operators/collective/c_gen_nccl_id_op.cc | 14 +- .../fluid/operators/collective/c_reduce_op.h | 20 +- .../collective/c_reducescatter_op.cu.cc | 16 +- .../operators/collective/c_scatter_op.cu.cc | 10 +- .../c_softmax_with_cross_entropy_op.cu | 20 +- .../collective/c_sync_calc_stream_op.h | 2 +- .../collective/c_sync_comm_stream_op.h | 6 +- .../operators/collective/c_wait_comm_op.cc | 7 +- .../operators/collective/c_wait_compute_op.cc | 7 +- .../operators/collective/gen_nccl_id_op.cc | 14 +- .../collective/global_gather_op.cu.cc | 38 +- .../collective/global_scatter_op.cu.cc | 38 +- .../collective/mp_allreduce_sum_op.cu.cc | 
4 +- .../collective/partial_allgather_op.cu.cc | 14 +- .../collective/partial_recv_op.cu.cc | 14 +- .../collective/partial_send_op.cu.cc | 14 +- .../operators/collective/recv_v2_op.cu.cc | 24 +- .../operators/collective/send_v2_op.cu.cc | 26 +- .../controlflow/conditional_block_op.h | 2 +- paddle/fluid/operators/controlflow/feed_op.cc | 2 +- .../operators/controlflow/get_places_op.cc | 4 +- .../operators/controlflow/while_op_helper.cc | 2 +- paddle/fluid/operators/data_norm_op.cu | 28 +- .../fluid/operators/detection/CMakeLists.txt | 4 +- .../fluid/operators/detection/bbox_util.cu.h | 2 +- .../detection/collect_fpn_proposals_op.cu | 2 +- paddle/fluid/operators/dgc_clip_by_norm_op.h | 76 +- .../elementwise/elementwise_op_function.h | 19 +- paddle/fluid/operators/expand_op.cc | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 2 - paddle/fluid/operators/fused/CMakeLists.txt | 12 +- .../fluid/operators/fused/attn_bias_add.cu.h | 2 +- .../operators/fused/fused_attention_utils.h | 10 +- .../operators/fused/fused_dropout_common.h | 4 +- .../fused/fused_multi_transformer_op.cu.h | 12 +- .../operators/fused/fused_seqpool_cvm_op.cu | 59 - .../fluid/operators/fused/yolo_box_post_op.cu | 39 - .../get_tensor_from_selected_rows_op.cc | 2 +- .../fluid/operators/graph_khop_sampler_op.cu | 11 - .../operators/grid_sampler_cudnn_op.cu.cc | 2 +- paddle/fluid/operators/hinge_loss_op.cc | 2 +- paddle/fluid/operators/im2sequence_op.cc | 2 +- paddle/fluid/operators/isfinite_op.h | 8 +- paddle/fluid/operators/l1_norm_op.cc | 2 +- paddle/fluid/operators/load_op.cc | 2 +- .../fluid/operators/lod_tensor_to_array_op.cc | 2 +- paddle/fluid/operators/lookup_table_v2_op.cu | 3 - .../operators/margin_cross_entropy_op.cu | 38 +- .../operators/math/bert_encoder_functor.h | 8 +- paddle/fluid/operators/math/gru_compute.cc | 8 +- paddle/fluid/operators/math/inclusive_scan.h | 2 +- paddle/fluid/operators/math/prelu.h | 2 +- paddle/fluid/operators/math/sample_prob.cu | 5 - paddle/fluid/operators/math/sample_prob.h | 2 +- paddle/fluid/operators/matmul_op.cc | 12 +- paddle/fluid/operators/memcpy_h2d_op.h | 2 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 2 +- paddle/fluid/operators/minus_op.cc | 2 +- paddle/fluid/operators/nccl/CMakeLists.txt | 11 +- .../fluid/operators/nccl/nccl_gpu_common.cc | 10 +- paddle/fluid/operators/nccl/nccl_gpu_common.h | 4 +- paddle/fluid/operators/nccl/nccl_op.cc | 24 +- paddle/fluid/operators/nccl/nccl_op.cu.cc | 34 +- .../optimizers/distributed_fused_lamb_op.cu | 188 +- .../operators/optimizers/sparse_momentum_op.h | 5 +- .../operators/pscore/send_and_recv_op.cc | 2 +- paddle/fluid/operators/rank_loss_op.cc | 2 +- .../fluid/operators/reader/buffered_reader.cc | 9 +- .../fluid/operators/reader/buffered_reader.h | 4 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 4 +- paddle/fluid/operators/reshape_op.cc | 8 +- paddle/fluid/operators/save_op.cc | 2 +- paddle/fluid/operators/select_op_helper.h | 2 +- .../sequence_ops/sequence_reverse_op.h | 4 +- .../sequence_softmax_cudnn_op.cu.cc | 69 +- .../sequence_ops/sequence_softmax_op.cc | 2 +- .../sequence_ops/sequence_softmax_op.cu | 4 - paddle/fluid/operators/set_value_op.cc | 44 +- paddle/fluid/operators/split_lod_tensor_op.cc | 2 +- paddle/fluid/operators/svd_helper.h | 2 +- paddle/fluid/operators/sync_batch_norm_op.cu | 94 +- .../fluid/operators/sync_batch_norm_utils.h | 15 +- paddle/fluid/operators/top_k_op.cu | 3 +- paddle/fluid/operators/uniform_random_op.h | 4 +- paddle/fluid/platform/CMakeLists.txt | 66 +- 
paddle/fluid/platform/collective_helper.cc | 32 +- paddle/fluid/platform/collective_helper.h | 14 +- paddle/fluid/platform/device/CMakeLists.txt | 2 +- paddle/fluid/platform/device/device_wrapper.h | 2 +- .../fluid/platform/device/gpu/CMakeLists.txt | 12 - paddle/fluid/platform/device/gpu/gpu_helper.h | 4 +- paddle/fluid/platform/device/gpu/gpu_info.cc | 13 - paddle/fluid/platform/device/gpu/gpu_info.h | 2 +- .../platform/device/gpu/gpu_launch_config.h | 4 +- .../platform/device/gpu/gpu_resource_pool.cc | 12 +- .../platform/device/gpu/gpu_resource_pool.h | 7 +- paddle/fluid/platform/device/gpu/gpu_types.h | 124 +- .../platform/device/gpu/musa/musa_helper.h | 104 -- .../fluid/platform/device/gpu/nccl_helper.h | 86 +- paddle/fluid/platform/device_context.cc | 10 +- paddle/fluid/platform/device_context.h | 16 +- paddle/fluid/platform/device_event.h | 2 +- paddle/fluid/platform/device_event_base.cc | 8 - paddle/fluid/platform/device_event_gpu.cc | 2 +- paddle/fluid/platform/dynload/CMakeLists.txt | 22 - .../fluid/platform/dynload/dynamic_loader.h | 1 - paddle/fluid/platform/dynload/mccl.cc | 43 - paddle/fluid/platform/dynload/mccl.h | 51 - paddle/fluid/platform/dynload/mublas.cc | 38 - paddle/fluid/platform/dynload/mublas.h | 55 - paddle/fluid/platform/dynload/mudnn.cc | 30 - paddle/fluid/platform/dynload/mudnn.h | 39 - paddle/fluid/platform/dynload/mufft.cc | 30 - paddle/fluid/platform/dynload/mufft.h | 93 - paddle/fluid/platform/dynload/murand.cc | 27 - paddle/fluid/platform/dynload/murand.h | 43 - paddle/fluid/platform/dynload/musa_driver.cc | 31 - paddle/fluid/platform/dynload/musa_driver.h | 58 - paddle/fluid/platform/dynload/musartc.cc | 31 - paddle/fluid/platform/dynload/musartc.h | 51 - paddle/fluid/platform/dynload/musparse.cc | 30 - paddle/fluid/platform/dynload/musparse.h | 41 - paddle/fluid/platform/dynload/nccl.cc | 16 +- paddle/fluid/platform/dynload/nccl.h | 30 +- paddle/fluid/platform/dynload/rccl.cc | 16 +- paddle/fluid/platform/dynload/rccl.h | 14 +- paddle/fluid/platform/enforce.h | 26 +- paddle/fluid/platform/enforce_test.cc | 4 +- paddle/fluid/platform/event.h | 5 - paddle/fluid/platform/gen_comm_id_helper.cc | 6 +- paddle/fluid/platform/gen_comm_id_helper.h | 2 +- paddle/fluid/platform/init.cc | 20 +- paddle/fluid/platform/place.h | 4 +- paddle/fluid/platform/profiler.cc | 2 +- paddle/fluid/platform/profiler.cu | 19 - paddle/fluid/platform/profiler.h | 4 +- .../platform/profiler/chrometracing_logger.cc | 40 +- .../platform/profiler/chrometracing_logger.h | 2 +- .../profiler/dump/deserialization_reader.cc | 4 +- .../profiler/dump/deserialization_reader.h | 2 +- .../profiler/dump/serialization_logger.cc | 2 +- .../profiler/dump/serialization_logger.h | 2 +- .../fluid/platform/profiler/event_python.cc | 6 +- paddle/fluid/platform/profiler/event_python.h | 6 +- paddle/fluid/platform/profiler/profiler.cc | 13 +- .../fluid/platform/profiler/profiler_test.cc | 8 - paddle/fluid/platform/profiler_helper.h | 19 +- .../fluid/platform/stream_callback_manager.cc | 15 +- .../fluid/platform/stream_callback_manager.h | 5 - paddle/fluid/primitive/composite/composite.h | 45 +- paddle/fluid/pybind/CMakeLists.txt | 30 +- paddle/fluid/pybind/communication.cc | 2 +- paddle/fluid/pybind/cuda_streams_py.cc | 22 +- paddle/fluid/pybind/cuda_streams_py.h | 4 +- paddle/fluid/pybind/distributed_py.cc | 4 +- paddle/fluid/pybind/eager.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_math_op_patch.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 285 +-- 
paddle/fluid/pybind/generator_py.cc | 2 +- paddle/fluid/pybind/imperative.cc | 6 +- paddle/fluid/pybind/inference_api.cc | 14 +- paddle/fluid/pybind/parallel_executor.cc | 8 +- paddle/fluid/pybind/place.cc | 20 +- paddle/fluid/pybind/process_group_utils.h | 4 +- paddle/fluid/pybind/pybind.cc | 59 +- paddle/fluid/pybind/slice_utils.h | 151 +- paddle/fluid/pybind/tensor.cc | 10 +- paddle/fluid/pybind/tensor_py.h | 17 +- paddle/phi/CMakeLists.txt | 17 +- paddle/phi/api/include/context_pool.h | 2 +- paddle/phi/api/include/tensor.h | 7 +- paddle/phi/api/lib/api_gen_utils.cc | 6 +- paddle/phi/api/lib/context_pool.cc | 4 +- paddle/phi/api/lib/data_transform.cc | 8 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/api/lib/tensor_utils.cc | 40 +- paddle/phi/api/profiler/event.h | 32 +- paddle/phi/api/yaml/backward.yaml | 6 +- .../phi/api/yaml/generator/dist_bw_api_gen.py | 1 + paddle/phi/api/yaml/legacy_backward.yaml | 6 +- paddle/phi/api/yaml/op_compat.yaml | 2 +- paddle/phi/api/yaml/ops.yaml | 2 +- paddle/phi/backends/CMakeLists.txt | 6 +- paddle/phi/backends/context_pool.cc | 2 +- paddle/phi/backends/context_pool.h | 4 +- paddle/phi/backends/custom/custom_device.cc | 2 +- paddle/phi/backends/device_code.cc | 144 +- paddle/phi/backends/device_code.h | 16 +- paddle/phi/backends/device_memory_aligment.h | 2 +- paddle/phi/backends/dynload/CMakeLists.txt | 22 - paddle/phi/backends/dynload/dynamic_loader.cc | 48 - paddle/phi/backends/dynload/dynamic_loader.h | 1 - paddle/phi/backends/dynload/mccl.cc | 36 - paddle/phi/backends/dynload/mccl.h | 80 - paddle/phi/backends/dynload/mublas.cc | 38 - paddle/phi/backends/dynload/mublas.h | 128 -- paddle/phi/backends/dynload/mudnn.cc | 41 - paddle/phi/backends/dynload/mudnn.h | 41 - paddle/phi/backends/dynload/mufft.cc | 43 - paddle/phi/backends/dynload/mufft.h | 155 -- paddle/phi/backends/dynload/murand.cc | 28 - paddle/phi/backends/dynload/murand.h | 54 - paddle/phi/backends/dynload/musa_driver.cc | 33 - paddle/phi/backends/dynload/musa_driver.h | 69 - paddle/phi/backends/dynload/musartc.cc | 34 - paddle/phi/backends/dynload/musartc.h | 147 -- paddle/phi/backends/dynload/musparse.cc | 29 - paddle/phi/backends/dynload/musparse.h | 76 - paddle/phi/backends/dynload/nccl.h | 14 +- paddle/phi/backends/dynload/rccl.h | 14 +- paddle/phi/backends/gpu/forwards.h | 19 - paddle/phi/backends/gpu/gpu_context.cc | 176 +- paddle/phi/backends/gpu/gpu_context.h | 22 +- paddle/phi/backends/gpu/gpu_decls.h | 81 +- paddle/phi/backends/gpu/gpu_device_function.h | 4 +- paddle/phi/backends/gpu/gpu_dnn.h | 5 +- paddle/phi/backends/gpu/gpu_helper.h | 4 +- paddle/phi/backends/gpu/gpu_info.h | 2 +- paddle/phi/backends/gpu/gpu_launch_config.h | 4 +- paddle/phi/backends/gpu/gpu_primitives.h | 186 +- paddle/phi/backends/gpu/gpu_resources.cc | 175 +- paddle/phi/backends/gpu/gpu_resources.h | 8 +- paddle/phi/backends/gpu/gpu_types.h | 70 +- paddle/phi/backends/gpu/musa/mudnn_desc.h | 202 --- paddle/phi/backends/gpu/musa/mudnn_helper.h | 323 ---- .../backends/gpu/musa/musa_device_function.h | 193 -- paddle/phi/backends/gpu/musa/musa_helper.h | 74 - paddle/phi/backends/gpu/musa/musa_info.cc | 334 ---- paddle/phi/capi/include/c_meta_tensor.h | 12 + paddle/phi/capi/include/c_tensor.h | 17 + paddle/phi/capi/include/wrapper_base.h | 66 + paddle/phi/capi/lib/c_device_context.cc | 2 +- paddle/phi/capi/lib/c_kernel_context.cc | 2 +- paddle/phi/capi/lib/c_meta_tensor.cc | 46 + paddle/phi/capi/lib/c_tensor.cc | 72 + paddle/phi/common/backend.h | 2 +- paddle/phi/common/bfloat16.h | 40 +- 
paddle/phi/common/complex.h | 19 +- paddle/phi/common/cpstring_impl.h | 6 +- paddle/phi/common/float16.h | 53 +- paddle/phi/common/memory_utils.cc | 6 +- paddle/phi/common/memory_utils.h | 23 +- paddle/phi/common/place.cc | 4 +- paddle/phi/common/transform.h | 17 +- paddle/phi/core/compat/convert_utils.cc | 6 +- paddle/phi/core/cuda_stream.h | 22 - paddle/phi/core/distributed/CMakeLists.txt | 2 +- .../auto_parallel/reshard/reshard_utils.cc | 4 +- .../auto_parallel/reshard/reshard_utils.h | 4 +- .../phi/core/distributed/check/CMakeLists.txt | 2 +- .../distributed/check/nccl_dynamic_check.cc | 38 +- .../distributed/check/nccl_dynamic_check.h | 10 +- .../core/distributed/comm_context_manager.cc | 16 +- .../core/distributed/comm_context_manager.h | 8 +- paddle/phi/core/distributed/comm_task.h | 9 +- .../phi/core/distributed/comm_task_manager.cc | 2 +- .../phi/core/distributed/nccl_comm_context.cc | 50 +- .../phi/core/distributed/nccl_comm_context.h | 31 +- paddle/phi/core/distributed/nccl_comm_task.cc | 55 +- paddle/phi/core/distributed/nccl_comm_task.h | 6 +- paddle/phi/core/distributed/nccl_tools.cc | 76 +- paddle/phi/core/distributed/nccl_tools.h | 36 +- paddle/phi/core/enforce.h | 272 +-- paddle/phi/core/flags.cc | 22 +- paddle/phi/core/generator.cc | 5 +- paddle/phi/core/hostdevice.h | 6 +- paddle/phi/core/kernel_factory.cc | 4 +- paddle/phi/core/kernel_registry.cc | 2 +- paddle/phi/core/kernel_registry.h | 2 +- paddle/phi/core/kernel_utils.h | 2 +- paddle/phi/core/mixed_vector.cc | 4 +- paddle/phi/core/string_tensor.cc | 4 +- paddle/phi/core/tensor_utils.cc | 16 +- paddle/phi/core/utils/data_type.h | 29 +- paddle/phi/core/utils/type_info.cc | 4 +- paddle/phi/core/utils/visit_place.h | 4 +- paddle/phi/core/visit_type.h | 4 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/kernels/CMakeLists.txt | 76 +- paddle/phi/kernels/array_kernel.cc | 8 +- paddle/phi/kernels/assign_kernel.cc | 2 +- paddle/phi/kernels/autotune/gpu_timer.h | 39 +- paddle/phi/kernels/batch_norm_kernel.cc | 2 +- .../kernels/check_memory_continue_kernel.cc | 2 +- paddle/phi/kernels/coalesce_tensor_kernel.cc | 14 - .../phi/kernels/cpu/cum_maxmin_grad_kernel.cc | 8 +- paddle/phi/kernels/cpu/decode_jpeg_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_grad_kernel.cc | 2 +- paddle/phi/kernels/cpu/gelu_kernel.cc | 2 +- .../kernels/cpu/put_along_axis_grad_kernel.cc | 149 +- .../phi/kernels/cpu/put_along_axis_kernel.cc | 40 +- .../cpu/repeat_interleave_grad_kernel.cc | 6 +- .../kernels/cpu/repeat_interleave_kernel.cc | 6 +- .../phi/kernels/cpu/set_value_grad_kernel.cc | 17 + .../cpu/take_along_axis_grad_kernel.cc | 3 +- .../phi/kernels/cpu/take_along_axis_kernel.cc | 6 +- .../kernels/custom/c_embedding_grad_kernel.cc | 93 + .../phi/kernels/custom/c_embedding_kernel.cc | 84 + paddle/phi/kernels/dist_grad_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 2 +- paddle/phi/kernels/flatten_grad_kernel.cc | 2 +- paddle/phi/kernels/flatten_kernel.cc | 2 +- paddle/phi/kernels/full_kernel.cc | 2 +- paddle/phi/kernels/funcs/CMakeLists.txt | 8 +- paddle/phi/kernels/funcs/activation_functor.h | 2 +- paddle/phi/kernels/funcs/algorithm.h | 4 +- paddle/phi/kernels/funcs/blas/blas.h | 14 +- paddle/phi/kernels/funcs/blas/blas_impl.h | 4 +- paddle/phi/kernels/funcs/blas/blas_impl.mu.h | 1602 ----------------- paddle/phi/kernels/funcs/broadcast_function.h | 4 +- .../phi/kernels/funcs/check_numerics_utils.h | 2 +- .../kernels/funcs/concat_and_split_functor.cu | 2 +- .../phi/kernels/funcs/detail/gru_cpu_kernel.h | 2 +- 
.../phi/kernels/funcs/detail/gru_gpu_kernel.h | 4 +- paddle/phi/kernels/funcs/detail/gru_kernel.h | 10 +- .../kernels/funcs/detail/lstm_cpu_kernel.h | 2 +- paddle/phi/kernels/funcs/detail/lstm_kernel.h | 4 +- .../phi/kernels/funcs/detail/strided_memcpy.h | 6 +- paddle/phi/kernels/funcs/diagonal.h | 6 +- .../phi/kernels/funcs/distribution_helper.h | 48 +- paddle/phi/kernels/funcs/dropout_impl.cu.h | 23 +- paddle/phi/kernels/funcs/elementwise_base.h | 6 +- .../phi/kernels/funcs/elementwise_functor.h | 2 +- .../phi/kernels/funcs/elementwise_grad_base.h | 4 +- .../funcs/emb_eltwise_layer_norm_functor.cu | 7 +- paddle/phi/kernels/funcs/fc_functor.cu | 6 +- paddle/phi/kernels/funcs/fft.cu | 7 +- paddle/phi/kernels/funcs/fft_cache.h | 2 - paddle/phi/kernels/funcs/fft_fill_conj.h | 4 +- paddle/phi/kernels/funcs/for_range.h | 2 +- .../kernels/funcs/gather_scatter_functor.cc | 456 ++++- .../kernels/funcs/gather_scatter_functor.cu | 951 +++++++++- .../kernels/funcs/gather_scatter_functor.h | 183 ++ paddle/phi/kernels/funcs/gru_compute.cc | 8 +- paddle/phi/kernels/funcs/inclusive_scan.h | 2 +- paddle/phi/kernels/funcs/index_calculator.h | 2 +- paddle/phi/kernels/funcs/index_put_utils.h | 163 +- .../phi/kernels/funcs/interpolate_function.h | 4 +- paddle/phi/kernels/funcs/isfinite_functor.h | 6 +- paddle/phi/kernels/funcs/layer_norm_impl.cu.h | 2 +- paddle/phi/kernels/funcs/layer_norm_util.h | 4 +- paddle/phi/kernels/funcs/load_store_util.h | 2 +- paddle/phi/kernels/funcs/math_cuda_utils.h | 17 +- paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/funcs/math_function.h | 2 +- paddle/phi/kernels/funcs/matrix_inverse.cu | 2 +- paddle/phi/kernels/funcs/matrix_solve.cu | 2 +- paddle/phi/kernels/funcs/mode.h | 4 +- paddle/phi/kernels/funcs/mufft_util.h | 130 -- .../kernels/funcs/multihead_matmul_functor.cu | 10 +- paddle/phi/kernels/funcs/norm_utils.cu.h | 2 +- paddle/phi/kernels/funcs/pooling.h | 6 +- paddle/phi/kernels/funcs/reduce_function.h | 6 +- paddle/phi/kernels/funcs/segmented_array.h | 2 +- paddle/phi/kernels/funcs/select_impl.cu.h | 4 +- .../kernels/funcs/skip_layernorm_functor.cu | 8 +- .../kernels/funcs/skip_layernorm_functor.h | 6 - paddle/phi/kernels/funcs/softmax.cu | 36 +- paddle/phi/kernels/funcs/softmax.h | 2 +- paddle/phi/kernels/funcs/sparse/softmax.cu.h | 4 - paddle/phi/kernels/funcs/sparse/sparse_blas.h | 4 - paddle/phi/kernels/funcs/squared_l2_norm.h | 6 +- paddle/phi/kernels/funcs/strided_memcpy.h | 2 +- .../phi/kernels/funcs/top_k_function_cuda.h | 29 +- .../cutlass/fused_conv2d_add_act_kernel.cu | 1 - paddle/phi/kernels/fusion/gpu/block_attn.h | 1 + .../fusion/gpu/fused_bias_act_kernel.cu | 4 +- .../kernels/fusion/gpu/fused_bias_act_utils.h | 4 +- ...dropout_residual_layer_norm_grad_kernel.cu | 6 +- ...bias_dropout_residual_layer_norm_kernel.cu | 4 +- .../gpu/fused_bn_activation_grad_kernel.cu | 2 +- .../fusion/gpu/fused_bn_activation_kernel.cu | 2 +- .../fused_bn_add_activation_grad_kernel.cu | 2 +- .../gpu/fused_bn_add_activation_kernel.cu | 2 +- .../gpu/fused_dropout_add_grad_kernel.cu | 6 +- .../fusion/gpu/fused_dropout_add_kernel.cu | 6 +- .../fused_fc_elementwise_layernorm_kernel.cu | 4 +- .../fusion/gpu/fused_layernorm_kernel.cu | 11 +- .../phi/kernels/fusion/gpu/fused_rope_utils.h | 16 +- .../fused_softmax_mask_upper_triangle_utils.h | 8 +- .../fusion/gpu/fused_softmax_mask_utils.h | 10 +- .../gpu/masked_multihead_attention_kernel.cu | 4 +- paddle/phi/kernels/fusion/gpu/mmha_util.cu.h | 2 +- .../fusion/gpu/multihead_matmul_kernel.cu | 2 - 
.../phi/kernels/gpu/activation_grad_kernel.cu | 12 +- paddle/phi/kernels/gpu/activation_kernel.cu | 12 +- paddle/phi/kernels/gpu/all_gather_kernel.cu | 4 +- paddle/phi/kernels/gpu/all_reduce_kernel.cu | 21 +- paddle/phi/kernels/gpu/all_to_all_kernel.cu | 41 +- paddle/phi/kernels/gpu/allclose_kernel.cu | 2 - paddle/phi/kernels/gpu/arg_min_max_kernel.cu | 4 +- paddle/phi/kernels/gpu/argsort_grad_kernel.cu | 3 +- paddle/phi/kernels/gpu/argsort_kernel.cu | 2 +- paddle/phi/kernels/gpu/auc_kernel.cu | 10 +- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 16 +- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 18 +- paddle/phi/kernels/gpu/bernoulli_kernel.cu | 10 +- paddle/phi/kernels/gpu/broadcast_kernel.cu | 4 +- .../phi/kernels/gpu/check_numerics_kernel.cu | 6 - paddle/phi/kernels/gpu/cholesky_kernel.cu | 2 +- .../kernels/gpu/cholesky_solve_grad_kernel.cu | 2 +- .../phi/kernels/gpu/cholesky_solve_kernel.cu | 2 +- .../kernels/gpu/cross_entropy_grad_kernel.cu | 4 +- .../phi/kernels/gpu/cross_entropy_kernel.cu | 34 +- paddle/phi/kernels/gpu/cum_kernel.cu | 2 - .../phi/kernels/gpu/cum_maxmin_grad_kernel.cu | 8 +- paddle/phi/kernels/gpu/cumprod_grad_kernel.cu | 2 - paddle/phi/kernels/gpu/decode_jpeg_kernel.cu | 2 +- paddle/phi/kernels/gpu/dgc_kernel.cu | 2 +- paddle/phi/kernels/gpu/dirichlet_kernel.cu | 12 - paddle/phi/kernels/gpu/dist_concat_kernel.cu | 4 +- paddle/phi/kernels/gpu/dist_kernel.cu | 2 +- .../gpu/distribute_fpn_proposals_kernel.cu | 2 +- paddle/phi/kernels/gpu/eigh_kernel.cu | 2 +- paddle/phi/kernels/gpu/eigvalsh_kernel.cu | 2 +- .../phi/kernels/gpu/embedding_grad_kernel.cu | 3 - .../kernels/gpu/generate_proposals_kernel.cu | 2 +- .../phi/kernels/gpu/graph_reindex_kernel.cu | 12 - .../gpu/graph_sample_neighbors_kernel.cu | 17 - .../kernels/gpu/graph_send_ue_recv_funcs.h | 9 - paddle/phi/kernels/gpu/group_norm_kernel.cu | 13 +- paddle/phi/kernels/gpu/group_norm_utils.h | 2 +- .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 5 +- paddle/phi/kernels/gpu/instance_norm_utils.h | 2 +- .../kernels/gpu/interpolate_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/kthvalue_kernel.cu | 12 +- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 2 +- .../phi/kernels/gpu/logsumexp_function.cu.h | 58 - paddle/phi/kernels/gpu/lstsq_kernel.cu | 2 +- paddle/phi/kernels/gpu/lu_kernel.cu | 2 +- paddle/phi/kernels/gpu/matrix_rank_kernel.cu | 2 +- .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 2 +- .../phi/kernels/gpu/multiclass_nms3_kernel.cu | 2 +- paddle/phi/kernels/gpu/multinomial_kernel.cu | 12 +- .../phi/kernels/gpu/nll_loss_grad_kernel.cu | 2 - paddle/phi/kernels/gpu/nll_loss_kernel.cu | 2 - paddle/phi/kernels/gpu/nonzero_kernel.cu | 2 +- paddle/phi/kernels/gpu/nop_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/norm_kernel.cu | 2 +- paddle/phi/kernels/gpu/p_recv_kernel.cu | 17 +- paddle/phi/kernels/gpu/p_send_kernel.cu | 19 +- paddle/phi/kernels/gpu/poisson_kernel.cu | 20 +- .../kernels/gpu/put_along_axis_grad_kernel.cu | 122 +- .../phi/kernels/gpu/put_along_axis_kernel.cu | 40 +- paddle/phi/kernels/gpu/qr_kernel.cu | 2 +- paddle/phi/kernels/gpu/randperm_kernel.cu | 15 +- paddle/phi/kernels/gpu/reduce.h | 2 +- paddle/phi/kernels/gpu/reduce_grad.h | 2 +- paddle/phi/kernels/gpu/reduce_kernel.cu | 17 +- .../phi/kernels/gpu/reduce_scatter_kernel.cu | 6 +- .../gpu/repeat_interleave_grad_kernel.cu | 6 +- .../kernels/gpu/repeat_interleave_kernel.cu | 6 +- paddle/phi/kernels/gpu/rms_norm_kernel.cu | 9 +- paddle/phi/kernels/gpu/rnn_functor.h | 55 - 
paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 2 +- .../kernels/gpu/send_u_recv_grad_kernel.cu | 2 - paddle/phi/kernels/gpu/send_u_recv_kernel.cu | 2 - .../kernels/gpu/send_ue_recv_grad_kernel.cu | 27 - paddle/phi/kernels/gpu/send_ue_recv_kernel.cu | 3 - paddle/phi/kernels/gpu/send_uv_grad_kernel.cu | 15 - .../phi/kernels/gpu/set_value_grad_kernel.cu | 17 + paddle/phi/kernels/gpu/sgd_kernel.cu | 16 - .../kernels/gpu/shuffle_batch_grad_kernel.cu | 2 +- .../phi/kernels/gpu/shuffle_batch_kernel.cu | 4 +- paddle/phi/kernels/gpu/shuffle_batch_utils.h | 2 +- .../gpu/sigmoid_cross_entropy_with_logits.h | 3 +- paddle/phi/kernels/gpu/strided_copy_kernel.cu | 11 +- paddle/phi/kernels/gpu/svd_kernel.cu | 2 +- .../gpu/take_along_axis_grad_kernel.cu | 3 +- .../phi/kernels/gpu/take_along_axis_kernel.cu | 6 +- .../phi/kernels/gpu/top_p_sampling_kernel.cu | 34 +- paddle/phi/kernels/gpu/unique_kernel.cu | 34 +- .../phi/kernels/gpu/viterbi_decode_kernel.cu | 2 +- .../gpu/weighted_sample_neighbors_kernel.cu | 6 - .../kernels/gpudnn/affine_grid_grad_kernel.cu | 2 +- .../phi/kernels/gpudnn/affine_grid_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 12 - .../phi/kernels/gpudnn/softmax_grad_kernel.cu | 2 +- paddle/phi/kernels/gpudnn/softmax_kernel.cu | 2 +- paddle/phi/kernels/group_norm_kernel.h | 2 +- .../phi/kernels/impl/clip_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/clip_kernel_impl.h | 4 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 2 +- paddle/phi/kernels/impl/diag_embed_impl.h | 4 +- .../phi/kernels/impl/dot_grad_kernel_impl.h | 12 +- .../impl/elementwise_grad_kernel_impl.h | 2 +- .../kernels/impl/elementwise_kernel_impl.h | 2 +- .../phi/kernels/impl/fft_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/isclose_kernel_impl.h | 4 +- .../phi/kernels/impl/kron_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/kron_kernel_impl.h | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/polygamma_kernel_impl.h | 4 +- paddle/phi/kernels/impl/pool_kernel_impl.h | 4 +- .../kernels/impl/quant_linear_kernel_impl.h | 2 +- paddle/phi/kernels/impl/renorm_impl.h | 6 +- .../impl/repeat_interleave_grad_kernel_impl.h | 10 +- .../impl/repeat_interleave_kernel_impl.h | 8 +- .../kernels/impl/segment_pool_kernel_impl.h | 7 +- .../kernels/impl/sequence_mask_kernel_impl.h | 4 +- .../kernels/impl/set_value_grad_kernel_impl.h | 22 + .../phi/kernels/impl/solve_grad_kernel_impl.h | 4 +- .../phi/kernels/impl/trace_grad_kernel_impl.h | 4 +- .../kernels/impl/unstack_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/unstack_kernel_impl.h | 6 +- .../phi/kernels/impl/warprnnt_kernel_impl.h | 2 +- .../impl/weight_quantize_kernel_gpu_impl.h | 11 +- paddle/phi/kernels/is_empty_kernel.cc | 2 +- paddle/phi/kernels/kps/elementwise_kernel.cu | 2 +- paddle/phi/kernels/layer_norm_kernel.h | 2 +- paddle/phi/kernels/memcpy_kernel.cc | 4 +- paddle/phi/kernels/npu_identity_kernel.cc | 2 +- .../kernels/primitive/compute_primitives.h | 6 - .../kernels/primitive/datamover_primitives.h | 5 - paddle/phi/kernels/prod_kernel.cc | 2 +- .../phi/kernels/put_along_axis_grad_kernel.h | 3 + paddle/phi/kernels/put_along_axis_kernel.h | 1 + paddle/phi/kernels/reduce_all_kernel.cc | 2 +- paddle/phi/kernels/reduce_amax_kernel.cc | 2 +- paddle/phi/kernels/reduce_amin_kernel.cc | 2 +- paddle/phi/kernels/reduce_any_kernel.cc | 2 +- paddle/phi/kernels/reduce_mean_kernel.cc | 2 +- paddle/phi/kernels/reduce_min_kernel.cc | 2 +- paddle/phi/kernels/reduce_sum_kernel.cc | 2 +- paddle/phi/kernels/reverse_kernel.cc | 2 +- 
.../selected_rows/activation_kernel.cc | 2 +- .../kernels/selected_rows/assign_kernel.cc | 2 +- .../elementwise_multiply_kernel.cc | 2 +- .../phi/kernels/selected_rows/full_kernel.cc | 6 +- .../kernels/selected_rows/isfinite_kernel.cc | 4 +- .../merge_selected_rows_kernel.cc | 2 +- .../phi/kernels/selected_rows/scale_kernel.cc | 2 +- .../phi/kernels/selected_rows/shape_kernel.cc | 2 +- .../kernels/selected_rows/uniform_kernel.cc | 2 +- paddle/phi/kernels/set_value_grad_kernel.h | 10 + paddle/phi/kernels/shape_kernel.cc | 2 +- .../kernels/sparse/gpu/softmax_grad_kernel.cu | 3 - .../kernels/sparse/gpu/sparse_utils_kernel.cu | 7 - paddle/phi/kernels/squeeze_grad_kernel.cc | 2 +- paddle/phi/kernels/squeeze_kernel.cc | 2 +- .../phi/kernels/stride/as_complex_kernel.cc | 2 +- paddle/phi/kernels/stride/as_real_kernel.cc | 2 +- .../phi/kernels/stride/complex_grad_kernel.cc | 2 +- paddle/phi/kernels/stride/complex_kernel.cc | 2 +- .../phi/kernels/strided_slice_grad_kernel.cc | 2 +- paddle/phi/kernels/strided_slice_kernel.cc | 2 +- paddle/phi/kernels/strings/case_utils.h | 2 +- paddle/phi/kernels/strings/gpu/copy_utils.h | 10 +- .../kernels/strings/strings_empty_kernel.cc | 2 +- paddle/phi/kernels/strings/unicode.cc | 10 +- paddle/phi/kernels/strings/unicode.h | 2 +- paddle/phi/kernels/transfer_layout_kernel.cc | 4 +- paddle/phi/kernels/unsqueeze_grad_kernel.cc | 2 +- paddle/phi/kernels/unsqueeze_kernel.cc | 2 +- .../phi/kernels/xpu/set_value_grad_kernel.cc | 31 + paddle/phi/tools/CMakeLists.txt | 4 - patches/eigen/Complex.h.patch | 33 +- patches/eigen/Eigen_CORE.patch | 13 - ...c_Core_util_ConfigureVectorization.h.patch | 21 - .../eigen/Eigen_src_Core_util_Macros.h.patch | 51 - .../eigen/Eigen_src_Core_util_Meta.h.patch | 58 - patches/eigen/TensorReductionGpu.h | 2 +- .../unsupported_Eigen_CXX11_Tensor.patch | 13 - ...11_src_Tensor_TensorContractionGpu.h.patch | 22 - ...X11_src_Tensor_TensorDeviceDefault.h.patch | 15 - ...n_CXX11_src_Tensor_TensorDeviceGpu.h.patch | 15 - ...src_Tensor_TensorGpuHipCudaDefines.h.patch | 40 - ...n_CXX11_src_Tensor_TensorReduction.h.patch | 13 - python/CMakeLists.txt | 2 - python/cinn/compiler/expr_executor.py | 9 +- python/env_dict.py.in | 1 - python/paddle/__init__.py | 1 - python/paddle/base/__init__.py | 1 - .../base/dygraph/tensor_patch_methods.py | 11 +- python/paddle/base/executor.py | 12 +- python/paddle/base/framework.py | 17 +- python/paddle/base/layers/math_op_patch.py | 6 +- python/paddle/base/variable_index.py | 143 +- python/paddle/dataset/common.py | 6 + python/paddle/device/__init__.py | 2 - python/paddle/device/cuda/graphs.py | 3 +- python/paddle/distributed/auto_tuner/prune.py | 43 +- .../distributed/fleet/base/role_maker.py | 9 +- .../paddle/distributed/fleet/launch_utils.py | 2 +- .../distributed/fleet/layers/mpu/mp_layers.py | 2 +- python/paddle/distributed/fleet/utils/fs.py | 33 +- .../fleet/utils/sequence_parallel_utils.py | 1 - .../launch/controllers/collective.py | 10 +- .../paddle/distributed/launch/utils/nvsmi.py | 2 - python/paddle/distributed/rpc/rpc.py | 4 +- .../paddle/distributed/utils/launch_utils.py | 2 +- python/paddle/hapi/hub.py | 1 - .../incubate/distributed/fleet/fleet_util.py | 22 +- .../paddle/io/dataloader/dataloader_iter.py | 7 +- .../paddle/jit/dy2static/convert_operators.py | 7 +- python/paddle/nn/functional/conv.py | 1 - python/paddle/nn/functional/vision.py | 2 +- python/paddle/nn/quant/format.py | 39 +- .../paddle/quantization/observers/__init__.py | 3 +- .../quantization/observers/groupwise.py | 113 ++ 
python/paddle/quantization/quantize.py | 17 +- python/paddle/tensor/manipulation.py | 74 +- .../utils/cpp_extension/extension_utils.py | 1 + python/paddle/utils/download.py | 49 +- python/setup.py.in | 2 +- security/README.md | 36 +- security/README_cn.md | 38 +- security/README_ja.md | 36 +- security/advisory/pdsa-2023-004_cn.md | 2 +- security/advisory/pdsa-2023-006.md | 31 + security/advisory/pdsa-2023-006_cn.md | 31 + security/advisory/pdsa-2023-007.md | 31 + security/advisory/pdsa-2023-007_cn.md | 31 + security/advisory/pdsa-2023-008.md | 31 + security/advisory/pdsa-2023-008_cn.md | 31 + security/advisory/pdsa-2023-009.md | 31 + security/advisory/pdsa-2023-009_cn.md | 31 + security/advisory/pdsa-2023-010.md | 33 + security/advisory/pdsa-2023-010_cn.md | 33 + security/advisory/pdsa-2023-011.md | 32 + security/advisory/pdsa-2023-011_cn.md | 32 + security/advisory/pdsa-2023-012.md | 35 + security/advisory/pdsa-2023-012_cn.md | 35 + security/advisory/pdsa-2023-013.md | 32 + security/advisory/pdsa-2023-013_cn.md | 32 + security/advisory/pdsa-2023-014.md | 32 + security/advisory/pdsa-2023-014_cn.md | 32 + security/advisory/pdsa-2023-015.md | 33 + security/advisory/pdsa-2023-015_cn.md | 33 + security/advisory/pdsa-2023-016.md | 32 + security/advisory/pdsa-2023-016_cn.md | 32 + security/advisory/pdsa-2023-017.md | 33 + security/advisory/pdsa-2023-017_cn.md | 33 + security/advisory/pdsa-2023-018.md | 32 + security/advisory/pdsa-2023-018_cn.md | 32 + security/advisory/pdsa-2023-019.md | 35 + security/advisory/pdsa-2023-019_cn.md | 35 + security/advisory/pdsa-2023-020.md | 28 + security/advisory/pdsa-2023-020_cn.md | 28 + security/advisory/pdsa-2023-021.md | 33 + security/advisory/pdsa-2023-021_cn.md | 33 + security/advisory/pdsa-2023-022.md | 30 + security/advisory/pdsa-2023-022_cn.md | 30 + security/advisory/pdsa-2023-023.md | 28 + security/advisory/pdsa-2023-023_cn.md | 28 + .../hybrid_strategy/CMakeLists.txt | 2 +- test/collective/fleet/CMakeLists.txt | 4 +- .../run_server_for_communicator_half_async.py | 38 + .../fleet/test_communicator_half_async.py | 118 +- .../fleet/test_dygraph_sharding_stage2.py | 9 +- .../fleet/test_parallel_dygraph_mp_layers.py | 5 +- .../fleet/test_parallel_dygraph_qat.py | 2 +- test/cpp/fluid/CMakeLists.txt | 2 - test/cpp/fluid/inference/CMakeLists.txt | 1 - test/cpp/fluid/inference/utils/CMakeLists.txt | 16 - .../fluid/inference/utils/io_utils_tester.cc | 154 -- test/cpp/fluid/nccl/CMakeLists.txt | 2 +- test/cpp/fluid/nccl/nccl_op_test.cu.cc | 12 +- test/cpp/imperative/CMakeLists.txt | 3 +- test/cpp/imperative/nccl_context_test.cc | 10 +- test/cpp/inference/api/tester_helper.h | 12 - .../inference/api/trt_dynamic_shape_test.cc | 1 + test/cpp/inference/test.cmake | 7 +- test/custom_runtime/CMakeLists.txt | 2 +- .../test_collective_process_group_xccl.py | 5 +- test/custom_runtime/test_custom_cpu_plugin.py | 3 +- .../test_custom_cpu_profiler_plugin.py | 3 +- .../test_custom_cpu_to_static.py | 3 +- test/custom_runtime/test_custom_op_setup.py | 3 +- .../test_fleet_launch_custom_device.sh | 2 +- test/dygraph_to_static/CMakeLists.txt | 14 +- test/dygraph_to_static/test_list.py | 1 + test/dygraph_to_static/test_mobile_net.py | 11 +- test/indexing/test_getitem.py | 34 + test/indexing/test_setitem.py | 130 +- test/ir/inference/program_config.py | 28 +- test/ir/inference/test_trt_convert_assign.py | 5 +- test/ir/inference/test_trt_convert_cast.py | 1 + .../test_trt_convert_lookup_table.py | 1 + test/ir/inference/test_trt_convert_solve.py | 5 +- 
test/legacy_test/CMakeLists.txt | 4 +- test/legacy_test/c_embedding_op_base.py | 25 +- test/legacy_test/test_adaptive_avg_pool1d.py | 1 - test/legacy_test/test_dist_hapi_model.py | 2 +- test/legacy_test/test_download.py | 15 +- .../test_parallel_dygraph_dataparallel.py | 2 +- ...t_parallel_dygraph_dataparallel_cpuonly.py | 2 +- test/legacy_test/test_put_along_axis_op.py | 762 +++++++- test/legacy_test/test_repeat_interleave_op.py | 19 + test/legacy_test/test_set_value_op.py | 82 + .../test_sparse_fused_attention_op.py | 5 + test/legacy_test/test_yolov3_loss_op.py | 3 +- test/quantization/test_groupwise.py | 69 + test/quantization/test_llm_int8_linear.py | 90 +- ..._post_training_quantization_mobilenetv1.py | 70 +- ...est_post_training_quantization_resnet50.py | 2 +- test/quantization/test_ptq.py | 42 + test/quantization/test_weight_only_linear.py | 42 + .../xpu/test_parallel_dygraph_dataparallel.py | 2 +- third_party/cryptopp | 1 + third_party/cryptopp-cmake | 1 + tools/enforce/grep_invalid_enforce.sh | 2 +- tools/parallel_UT_rule.py | 6 - 915 files changed, 8842 insertions(+), 11920 deletions(-) delete mode 100644 cmake/mccl.cmake delete mode 100644 cmake/mudnn.cmake delete mode 100644 cmake/musa.cmake mode change 100755 => 100644 paddle/fluid/inference/tensorrt/op_teller.cc delete mode 100644 paddle/fluid/inference/utils/benchmark.cc delete mode 100644 paddle/fluid/inference/utils/benchmark.h delete mode 100644 paddle/fluid/inference/utils/benchmark_tester.cc delete mode 100644 paddle/fluid/inference/utils/table_printer_tester.cc delete mode 100644 paddle/fluid/platform/device/gpu/musa/musa_helper.h delete mode 100644 paddle/fluid/platform/dynload/mccl.cc delete mode 100644 paddle/fluid/platform/dynload/mccl.h delete mode 100644 paddle/fluid/platform/dynload/mublas.cc delete mode 100644 paddle/fluid/platform/dynload/mublas.h delete mode 100644 paddle/fluid/platform/dynload/mudnn.cc delete mode 100644 paddle/fluid/platform/dynload/mudnn.h delete mode 100644 paddle/fluid/platform/dynload/mufft.cc delete mode 100644 paddle/fluid/platform/dynload/mufft.h delete mode 100644 paddle/fluid/platform/dynload/murand.cc delete mode 100644 paddle/fluid/platform/dynload/murand.h delete mode 100644 paddle/fluid/platform/dynload/musa_driver.cc delete mode 100644 paddle/fluid/platform/dynload/musa_driver.h delete mode 100644 paddle/fluid/platform/dynload/musartc.cc delete mode 100644 paddle/fluid/platform/dynload/musartc.h delete mode 100644 paddle/fluid/platform/dynload/musparse.cc delete mode 100644 paddle/fluid/platform/dynload/musparse.h delete mode 100644 paddle/phi/backends/dynload/mccl.cc delete mode 100644 paddle/phi/backends/dynload/mccl.h delete mode 100644 paddle/phi/backends/dynload/mublas.cc delete mode 100644 paddle/phi/backends/dynload/mublas.h delete mode 100644 paddle/phi/backends/dynload/mudnn.cc delete mode 100644 paddle/phi/backends/dynload/mudnn.h delete mode 100644 paddle/phi/backends/dynload/mufft.cc delete mode 100644 paddle/phi/backends/dynload/mufft.h delete mode 100644 paddle/phi/backends/dynload/murand.cc delete mode 100644 paddle/phi/backends/dynload/murand.h delete mode 100644 paddle/phi/backends/dynload/musa_driver.cc delete mode 100644 paddle/phi/backends/dynload/musa_driver.h delete mode 100644 paddle/phi/backends/dynload/musartc.cc delete mode 100644 paddle/phi/backends/dynload/musartc.h delete mode 100644 paddle/phi/backends/dynload/musparse.cc delete mode 100644 paddle/phi/backends/dynload/musparse.h delete mode 100644 paddle/phi/backends/gpu/musa/mudnn_desc.h delete 
mode 100644 paddle/phi/backends/gpu/musa/mudnn_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/musa_device_function.h delete mode 100644 paddle/phi/backends/gpu/musa/musa_helper.h delete mode 100644 paddle/phi/backends/gpu/musa/musa_info.cc create mode 100644 paddle/phi/kernels/custom/c_embedding_grad_kernel.cc create mode 100644 paddle/phi/kernels/custom/c_embedding_kernel.cc delete mode 100644 paddle/phi/kernels/funcs/blas/blas_impl.mu.h delete mode 100644 paddle/phi/kernels/funcs/mufft_util.h delete mode 100644 patches/eigen/Eigen_CORE.patch delete mode 100644 patches/eigen/Eigen_src_Core_util_ConfigureVectorization.h.patch delete mode 100644 patches/eigen/Eigen_src_Core_util_Macros.h.patch delete mode 100644 patches/eigen/Eigen_src_Core_util_Meta.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_Tensor.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorContractionGpu.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceDefault.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceGpu.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorGpuHipCudaDefines.h.patch delete mode 100644 patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorReduction.h.patch create mode 100644 python/paddle/quantization/observers/groupwise.py create mode 100644 security/advisory/pdsa-2023-006.md create mode 100644 security/advisory/pdsa-2023-006_cn.md create mode 100644 security/advisory/pdsa-2023-007.md create mode 100644 security/advisory/pdsa-2023-007_cn.md create mode 100644 security/advisory/pdsa-2023-008.md create mode 100644 security/advisory/pdsa-2023-008_cn.md create mode 100644 security/advisory/pdsa-2023-009.md create mode 100644 security/advisory/pdsa-2023-009_cn.md create mode 100644 security/advisory/pdsa-2023-010.md create mode 100644 security/advisory/pdsa-2023-010_cn.md create mode 100644 security/advisory/pdsa-2023-011.md create mode 100644 security/advisory/pdsa-2023-011_cn.md create mode 100644 security/advisory/pdsa-2023-012.md create mode 100644 security/advisory/pdsa-2023-012_cn.md create mode 100644 security/advisory/pdsa-2023-013.md create mode 100644 security/advisory/pdsa-2023-013_cn.md create mode 100644 security/advisory/pdsa-2023-014.md create mode 100644 security/advisory/pdsa-2023-014_cn.md create mode 100644 security/advisory/pdsa-2023-015.md create mode 100644 security/advisory/pdsa-2023-015_cn.md create mode 100644 security/advisory/pdsa-2023-016.md create mode 100644 security/advisory/pdsa-2023-016_cn.md create mode 100644 security/advisory/pdsa-2023-017.md create mode 100644 security/advisory/pdsa-2023-017_cn.md create mode 100644 security/advisory/pdsa-2023-018.md create mode 100644 security/advisory/pdsa-2023-018_cn.md create mode 100644 security/advisory/pdsa-2023-019.md create mode 100644 security/advisory/pdsa-2023-019_cn.md create mode 100644 security/advisory/pdsa-2023-020.md create mode 100644 security/advisory/pdsa-2023-020_cn.md create mode 100644 security/advisory/pdsa-2023-021.md create mode 100644 security/advisory/pdsa-2023-021_cn.md create mode 100644 security/advisory/pdsa-2023-022.md create mode 100644 security/advisory/pdsa-2023-022_cn.md create mode 100644 security/advisory/pdsa-2023-023.md create mode 100644 security/advisory/pdsa-2023-023_cn.md create mode 100644 test/collective/fleet/run_server_for_communicator_half_async.py delete mode 100644 test/cpp/fluid/inference/CMakeLists.txt delete mode 100644 
test/cpp/fluid/inference/utils/CMakeLists.txt delete mode 100644 test/cpp/fluid/inference/utils/io_utils_tester.cc create mode 100644 test/quantization/test_groupwise.py create mode 160000 third_party/cryptopp create mode 160000 third_party/cryptopp-cmake

diff --git a/.gitmodules b/.gitmodules
index 8b06f4fb771cbb..0c41450793fc2a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -110,3 +110,11 @@
   path = third_party/cccl
   url = https://github.com/NVIDIA/cccl.git
   ignore = dirty
+[submodule "third_party/cryptopp"]
+  path = third_party/cryptopp
+  url = https://github.com/weidai11/cryptopp.git
+  ignore = dirty
+[submodule "third_party/cryptopp-cmake"]
+  path = third_party/cryptopp-cmake
+  url = https://github.com/noloader/cryptopp-cmake.git
+  ignore = dirty
diff --git a/CMakeLists.txt b/CMakeLists.txt
index da58f0095ae09d..e9f3fafe8d22ad 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,14 +41,13 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()

 project(paddle CXX C)
-# set(CMAKE_VERBOSE_MAKEFILE ON)
+
 # enable language CUDA
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 find_package(MKL CONFIG QUIET)
 option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF)
-option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" OFF)
-option(WITH_MUSA "Compile PaddlePaddle with MUSA" ON)
+option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
 option(WITH_MPI "Compile PaddlePaddle with MPI" OFF)
 option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
 option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
@@ -90,9 +89,6 @@ endif()
 if(WITH_GPU AND WITH_ROCM)
   message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time")
 endif()
-if(WITH_GPU AND WITH_MUSA)
-  message(FATAL_ERROR "Error when compile CUDA and MUSA at the same time")
-endif()

 if(WITH_GPU AND NOT APPLE)
   enable_language(CUDA)
@@ -256,7 +252,7 @@ option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_MULTINODE_TESTING "Test multinode apis and ops" OFF)
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
-option(WITH_DISTRIBUTE "Compile with distributed support" ON)
+option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(ON_INFER "Turn on inference optimization and inference-lib generation"
        ON)
@@ -289,7 +285,6 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
 option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
 option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
-option(WITH_MCCL "Compile PaddlePaddle with MCCL support" ON)
 option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
 option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
 option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
@@ -357,7 +352,6 @@ endif()
 if(LINUX
    AND NOT WITH_CUSTOM_DEVICE
    AND NOT WITH_GPU
-   AND NOT WITH_MUSA
    AND NOT WITH_ROCM
    AND NOT WITH_XPU
    AND NOT WITH_XPU_KP
@@ -410,14 +404,6 @@ if(NOT WITH_GPU AND WITH_NCCL)
       CACHE STRING "Disable NCCL when compiling without GPU" FORCE)
 endif()

-if(NOT WITH_MUSA AND WITH_MCCL)
-  message(
-    WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.")
-  set(WITH_MCCL
-      OFF
-      CACHE STRING "Disable MCCL when compiling without MUSA" FORCE)
-endif()
-
 if(NOT WITH_GPU AND WITH_CUDNN_DSO)
   message(
     WARNING
@@ -475,19 +461,6 @@ else()
   endif()
 endif()

-if(WITH_MCCL)
-  add_definitions("-DPADDLE_WITH_MCCL")
-  include(mccl)
-else()
-  if(WITH_MUSA)
-    message(
-      WARNING
-        "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used."
-    )
-  endif()
-endif()
-
-
 if(WITH_BRPC_RDMA)
   message(STATUS "Use brpc with rdma.")
   if(NOT WITH_DISTRIBUTE)
@@ -513,11 +486,6 @@ if(WITH_ROCM)
   include(cupti)
 endif()

-if(WITH_MUSA)
-  include(musa)
-  include(mudnn)
-endif()
-
 if(WITH_XPU_KP)
   include(xpu_kp)
 endif()
@@ -530,14 +498,6 @@ if(NOT WITH_ROCM AND WITH_RCCL)
       CACHE STRING "Disable RCCL when compiling without ROCM" FORCE)
 endif()

-if(NOT WITH_MUSA AND WITH_MCCL)
-  message(
-    WARNING "Disable MCCL when compiling without MUSA. Force WITH_MCCL=OFF.")
-  set(WITH_MCCL
-      OFF
-      CACHE STRING "Disable MCCL when compiling without MUSA" FORCE)
-endif()
-
 if(WITH_RCCL)
   add_definitions("-DPADDLE_WITH_RCCL")
   include(rccl)
@@ -550,18 +510,6 @@ else()
   endif()
 endif()

-if(WITH_MCCL)
-  add_definitions("-DPADDLE_WITH_MCCL")
-  include(mccl)
-else()
-  if(WITH_MUSA)
-    message(
-      WARNING
-        "If the environment is multi-card, the WITH_MCCL option needs to be turned on, otherwise only a single card can be used."
-    )
-  endif()
-endif()
-
 if(WITH_HETERPS AND WITH_PSLIB)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0")
 endif()
@@ -612,13 +560,6 @@ if(WITH_RPC)
         OFF
         CACHE BOOL "Disable WITH_RPC when compiling with ROCM" FORCE)
   endif()
-  if(WITH_MUSA AND WITH_RPC)
-    message(
-      WARNING "Disable WITH_RPC when compiling with MUSA. Force WITH_RPC=OFF.")
-    set(WITH_RPC
-        OFF
-        CACHE BOOL "Disable WITH_RPC when compiling with MUSA" FORCE)
-  endif()
   if(WITH_XPU AND WITH_RPC)
     message(
       WARNING "Disable WITH_RPC when compiling with XPU. Force WITH_RPC=OFF.")
@@ -690,12 +631,6 @@ include(configure) # add paddle env configuration

 include_directories("${PADDLE_SOURCE_DIR}")

-# distribute need openssl
-# openssl install tutorial: https://www.howtoforge.com/tutorial/how-to-install-openssl-from-source-on-linux/
-include_directories("/usr/local/ssl/include")
-link_directories("/usr/local/ssl/lib64")
-
-
 if(WITH_NV_JETSON)
   set(WITH_ARM
       ON
diff --git a/README.md b/README.md
index 8f708334ed28f1..001352ea45fc4d 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm

 ## Installation

-### Latest PaddlePaddle Release: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
+### Latest PaddlePaddle Release: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle.
diff --git a/README_cn.md b/README_cn.md
index a13fa5ba214503..cd45e4e3ecd2b7 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -18,9 +18,9 @@

 ## 安装

-### PaddlePaddle最新版本: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
+### PaddlePaddle 最新版本: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

-跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
+跟进 PaddlePaddle 最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)

 ### 安装最新稳定版本:
 ```
diff --git a/README_ja.md b/README_ja.md
index 22c78a1a79bbd9..dad60eb7ffcf87 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -20,7 +20,7 @@ PaddlePaddle は、工業化に対するコミットメントを持つ工業的

 ## インストール

-### PaddlePaddle の最新リリース: [v2.5](https://github.com/PaddlePaddle/Paddle/tree/release/2.5)
+### PaddlePaddle の最新リリース: [v2.6](https://github.com/PaddlePaddle/Paddle/tree/release/2.6)

 私たちのビジョンは、PaddlePaddle を通じて、誰もが深層学習を行えるようにすることです。
 PaddlePaddle の最新機能を追跡するために、私たちの[リリースのお知らせ](https://github.com/PaddlePaddle/Paddle/releases)を参照してください。
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 29cca57db65891..dc661fce388fe1 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -175,19 +175,6 @@ elseif(WITH_ROCM)
   if(${MIOPEN_VERSION} VERSION_LESS 2090)
     message(FATAL_ERROR "Paddle needs MIOPEN >= 2.9 to compile")
   endif()
-elseif(WITH_MUSA)
-  add_definitions(-DPADDLE_WITH_MUSA)
-  add_definitions(-DEIGEN_USE_GPU)
-  add_definitions(-DEIGEN_USE_MUSA)
-  if(MUPTI_FOUND)
-    include_directories(${CUPTI_INCLUDE_DIR})
-    add_definitions(-DPADDLE_WITH_MUPTI)
-  else()
-    message(STATUS "Cannot find MUPTI, GPU Profiling is incorrect.")
-  endif()
-  if(NOT MUDNN_FOUND)
-    message(FATAL_ERROR "Paddle needs mudnn to compile")
-  endif()
 else()
   add_definitions(-DHPPL_STUB_FUNC)
   list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake
index 5967b468d65ce5..eb7ad44af2313f 100644
--- a/cmake/cupti.cmake
+++ b/cmake/cupti.cmake
@@ -1,4 +1,4 @@
-if(NOT WITH_GPU AND NOT WITH_ROCM AND NOT WITH_MUSA)
+if(NOT WITH_GPU AND NOT WITH_ROCM)
   return()
 endif()

@@ -6,10 +6,6 @@ if(WITH_ROCM)
   set(CUPTI_ROOT
       "${ROCM_PATH}/cuda/extras/CUPTI"
       CACHE PATH "CUPTI ROOT")
-elseif(WITH_MUSA)
-  set(CUPTI_ROOT
-      "/usr/local/musa"
-      CACHE PATH "CUPTI ROOT")
 else()
   set(CUPTI_ROOT
       "/usr"
diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake
index 9daa4be7468e42..b3ec8f622923fd 100644
--- a/cmake/external/cryptopp.cmake
+++ b/cmake/external/cryptopp.cmake
@@ -14,12 +14,13 @@

 include(ExternalProject)

+set(CRYPTOPP_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp)
+set(CRYPTOPP_CMAKE_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cryptopp-cmake)
 set(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp)
 set(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp)
 set(CRYPTOPP_INCLUDE_DIR
     "${CRYPTOPP_INSTALL_DIR}/include"
     CACHE PATH "cryptopp include directory." FORCE)
-set(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git)
 set(CRYPTOPP_TAG CRYPTOPP_8_2_0)

 if(WIN32)
@@ -63,17 +64,16 @@ include_directories(${CRYPTOPP_INCLUDE_DIR})
 ExternalProject_Add(
   extern_cryptopp
   ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE}
-  GIT_REPOSITORY ${CRYPTOPP_REPOSITORY}
-  GIT_TAG ${CRYPTOPP_TAG}
   PREFIX ${CRYPTOPP_PREFIX_DIR}
+  SOURCE_DIR ${CRYPTOPP_SOURCE_DIR}
   UPDATE_COMMAND ""
   PATCH_COMMAND
-  COMMAND ${CMAKE_COMMAND} -E remove_directory "<SOURCE_DIR>/cmake/"
-  COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "<SOURCE_DIR>/cmake"
-  COMMAND cd "<SOURCE_DIR>/cmake" && git checkout tags/${CRYPTOPP_TAG} -b
-          ${CRYPTOPP_TAG}
-  COMMAND ${CMAKE_COMMAND} -E copy_directory "<SOURCE_DIR>/cmake/"
-          "<SOURCE_DIR>/"
+  COMMAND ${CMAKE_COMMAND} -E copy "${CRYPTOPP_CMAKE_SOURCE_DIR}/CMakeLists.txt"
+          "<SOURCE_DIR>/CMakeLists.txt"
+  COMMAND
+    ${CMAKE_COMMAND} -E copy
+    "${CRYPTOPP_CMAKE_SOURCE_DIR}/cryptopp-config.cmake"
+    "<SOURCE_DIR>/cryptopp-config.cmake"
   COMMAND ${CRYPTOPP_PATCH_COMMAND}
   INSTALL_DIR ${CRYPTOPP_INSTALL_DIR}
   CMAKE_ARGS ${CRYPTOPP_CMAKE_ARGS}
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 4051a09d767f6b..06e37b3c8a6028 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -60,76 +60,6 @@ if(CMAKE_COMPILER_IS_GNUCC)
         ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/arch/SSE/
         < ${complex_header})
   endif()
-  if(WITH_MUSA)
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_ConfigureVectorization.h.patch
-      configure_vectorization_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/util/
-        < ${configure_vectorization_header})
-    file(TO_NATIVE_PATH
-         ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_Macros.h.patch
-         util_macros_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd ${SOURCE_DIR}/Eigen/src/Core/util/
-        < ${util_macros_header})
-    file(TO_NATIVE_PATH
-         ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_src_Core_util_Meta.h.patch
-         meta_header)
-    set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && patch -Nd
-                            ${SOURCE_DIR}/Eigen/src/Core/util/ < ${meta_header})
-    file(TO_NATIVE_PATH
-         ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_Tensor.patch
-         cxx11_tensor)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/ < ${cxx11_tensor})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorContractionGpu.h.patch
-      tensor_contraction_gpu_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
-        ${tensor_contraction_gpu_header})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceDefault.h.patch
-      tensor_device_default_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
-        ${tensor_device_default_header})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorGpuHipCudaDefines.h.patch
-      tensor_gpu_hip_cuda_defines_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
-        ${tensor_gpu_hip_cuda_defines_header})
-    file(
-      TO_NATIVE_PATH
-      ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorReduction.h.patch
-      tensor_reduction_header)
-    set(EIGEN_PATCH_COMMAND
-        ${EIGEN_PATCH_COMMAND} && patch -Nd
-        ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ <
- ${tensor_reduction_header}) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Eigen_CORE.patch - eigen_core) - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && patch -Nd - ${SOURCE_DIR}/Eigen/ < ${eigen_core}) - file( - TO_NATIVE_PATH - ${PADDLE_SOURCE_DIR}/patches/eigen/unsupported_Eigen_CXX11_src_Tensor_TensorDeviceGpu.h.patch - tensor_device_gpu_header) - set(EIGEN_PATCH_COMMAND - ${EIGEN_PATCH_COMMAND} && patch -Nd - ${SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/ < - ${tensor_device_gpu_header}) - endif() endif() set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 8d6384d2f0a141..7a4956e6e15567 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -246,11 +246,6 @@ if(WITH_GPU) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") endif() -if(WITH_MUSA) - set(CMAKE_MUSA_FLAGS "${CMAKE_MUSA_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") -endif() - - if(WITH_ROCM) set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${SAFE_GPU_COMMON_FLAGS}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 788237cc4699b4..c463dbc6064e12 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -453,9 +453,6 @@ function(cc_binary TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() - if(WITH_MUSA) - target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) - endif() check_coverage_opt(${TARGET_NAME} ${cc_binary_SRCS}) @@ -484,12 +481,6 @@ function(cc_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() - if(WITH_MUSA) - target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) - # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/' - target_link_options(${TARGET_NAME} PRIVATE - -Wl,-rpath,/usr/lib/x86_64-linux-gnu/) - endif(()) check_coverage_opt(${TARGET_NAME} ${cc_test_SRCS}) endif() endfunction() @@ -628,12 +619,6 @@ function(paddle_test_build TARGET_NAME) if(WITH_ROCM) target_link_libraries(${TARGET_NAME} ${ROCM_HIPRTC_LIB}) endif() - if(WITH_MUSA) - target_link_libraries(${TARGET_NAME} ${MUSARTC_LIB}) - # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/' - target_link_options(${TARGET_NAME} PRIVATE - -Wl,-rpath,/usr/lib/x86_64-linux-gnu/) - endif() if(APPLE) target_link_libraries( ${TARGET_NAME} @@ -765,115 +750,6 @@ function(nv_test TARGET_NAME) endif() endfunction() - - -function(musa_library TARGET_NAME) - if(WITH_MUSA) - set(options STATIC static SHARED shared) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(musa_library "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - if(musa_library_SRCS) - if(musa_library_SHARED OR musa_library_shared) # build *.so - musa_add_library(${TARGET_NAME} SHARED ${musa_library_SRCS}) - else() - musa_add_library(${TARGET_NAME} STATIC ${musa_library_SRCS}) - find_fluid_modules(${TARGET_NAME}) - find_phi_modules(${TARGET_NAME}) - endif() - if(musa_library_DEPS) - add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) - endif() - # cpplint code style - foreach(source_file ${musa_library_SRCS}) - string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND musa_library_HEADERS - ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - endif() - endforeach() - else() - if(musa_library_DEPS) - list(REMOVE_DUPLICATES musa_library_DEPS) - generate_dummy_static_lib( - LIB_NAME ${TARGET_NAME} FILE_PATH ${target_SRCS} 
GENERATOR - "generic.cmake:musa_library") - - target_link_libraries(${TARGET_NAME} ${musa_library_DEPS}) - add_dependencies(${TARGET_NAME} ${musa_library_DEPS}) - else() - message(FATAL "Please specify source file or library in musa_library.") - endif() - endif() - endif() -endfunction() - -function(musa_binary TARGET_NAME) - if(WITH_MUSA) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(musa_binary "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - add_executable(${TARGET_NAME} ${musa_binary_SRCS}) - if(musa_binary_DEPS) - target_link_libraries(${TARGET_NAME} ${musa_binary_DEPS}) - add_dependencies(${TARGET_NAME} ${musa_binary_DEPS}) - common_link(${TARGET_NAME}) - endif() - endif() -endfunction() - -function(musa_test TARGET_NAME) - if(WITH_MUSA AND WITH_TESTING) - set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(musa_test "${options}" "${oneValueArgs}" - "${multiValueArgs}" ${ARGN}) - musa_add_executable(${TARGET_NAME} ${musa_test_SRCS}) - # "-pthread -ldl -lrt" is defined in CMAKE_CXX_LINK_EXECUTABLE - target_link_options(${TARGET_NAME} PRIVATE -pthread -ldl -lrt) - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries( - ${TARGET_NAME} - ${musa_test_DEPS} - paddle_gtest_main - lod_tensor - memory - gtest - glog - phi - ${os_dependency_modules}) - add_dependencies( - ${TARGET_NAME} - ${musa_test_DEPS} - paddle_gtest_main - lod_tensor - memory - gtest - phi - glog) - common_link(${TARGET_NAME}) - add_test(${TARGET_NAME} ${TARGET_NAME}) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_cpu_deterministic=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT - FLAGS_cudnn_deterministic=true) - set_property( - TEST ${TARGET_NAME} - PROPERTY - ENVIRONMENT - "LD_LIBRARY_PATH=${CMAKE_BINARY_DIR}/python/paddle/libs:$LD_LIBRARY_PATH" - ) - endif() -endfunction() - - - function(hip_library TARGET_NAME) if(WITH_ROCM) set(options STATIC static SHARED shared) @@ -882,12 +758,6 @@ function(hip_library TARGET_NAME) cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(hip_library_SRCS) - # FindHIP.cmake defined hip_add_library, HIP_SOURCE_PROPERTY_FORMAT is requried if no .cu files found - if(NOT (${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/operators" - OR ${CMAKE_CURRENT_SOURCE_DIR} MATCHES ".*/phi/kernels")) - set_source_files_properties(${hip_library_SRCS} - PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) - endif() if(hip_library_SHARED OR hip_library_shared) # build *.so hip_add_library(${TARGET_NAME} SHARED ${hip_library_SRCS}) else() @@ -901,6 +771,10 @@ function(hip_library TARGET_NAME) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) + if(NOT ${source_file} MATCHES "\\.cu$") + set_source_files_properties(${source_file} + PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + endif() string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) list(APPEND hip_library_HEADERS @@ -1501,15 +1375,6 @@ function(math_library TARGET) ${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) - elseif(WITH_MUSA) - musa_library( - ${TARGET} - SRCS - ${cc_srcs} - ${cu_srcs} - DEPS - ${math_library_DEPS} - ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library( ${TARGET} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 
06dc5d6173794a..517ac24cccc72e 100755 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -237,6 +237,16 @@ copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR}) set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") +if(WIN32) + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/common.*) +else() + set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) +endif() +copy( + inference_lib_dist + SRCS ${paddle_common_lib} + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) + if(WIN32) if(WITH_STATIC_LIB) set(paddle_inference_lib @@ -268,11 +278,6 @@ else() SRCS ${paddle_phi_lib} DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() - set(paddle_common_lib ${PADDLE_BINARY_DIR}/paddle/common/libcommon.*) - copy( - inference_lib_dist - SRCS ${paddle_common_lib} - DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/lib) endif() copy( diff --git a/cmake/mccl.cmake b/cmake/mccl.cmake deleted file mode 100644 index 5ce4ea9c25fec0..00000000000000 --- a/cmake/mccl.cmake +++ /dev/null @@ -1,51 +0,0 @@ -if(NOT WITH_MUSA) - return() -endif() - -# Now we don't support MCCL on windows -if(WIN32) - return() -endif() - -if(WITH_MCCL) - set(MCCL_ROOT - "/usr/local/musa/" - CACHE PATH "MCCL ROOT") - find_path( - MCCL_INCLUDE_DIR mccl.h - PATHS ${MCCL_ROOT} ${MCCL_ROOT}/include ${MCCL_ROOT}/local/include - $ENV{MCCL_ROOT} $ENV{MCCL_ROOT}/include $ENV{MCCL_ROOT}/local/include - NO_DEFAULT_PATH) - - if(MCCL_INCLUDE_DIR) - file(READ ${MCCL_INCLUDE_DIR}/mccl.h MCCL_VERSION_FILE_CONTENTS) - - string(REGEX MATCH "define MCCL_MAJOR +([0-9]+)" MCCL_MAJOR_VERSION - "${MCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MCCL_MAJOR +([0-9]+)" "\\1" MCCL_MAJOR_VERSION - "${MCCL_MAJOR_VERSION}") - string(REGEX MATCH "define MCCL_MINOR +([0-9]+)" MCCL_MINOR_VERSION - "${MCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MCCL_MINOR +([0-9]+)" "\\1" MCCL_MINOR_VERSION - "${MCCL_MINOR_VERSION}") - string(REGEX MATCH "define MCCL_PATCH +([0-9]+)" MCCL_PATCH_VERSION - "${MCCL_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MCCL_PATCH +([0-9]+)" "\\1" MCCL_PATCH_VERSION - "${MCCL_PATCH_VERSION}") - if(NOT MCCL_MAJOR_VERSION) - set(MCCL_VERSION "???") - else() - math(EXPR MCCL_VERSION "${MCCL_MAJOR_VERSION} * 1000 + - ${MCCL_MINOR_VERSION} * 100 + ${MCCL_PATCH_VERSION}") - endif() - include_directories(${MCCL_INCLUDE_DIR}) - - message(STATUS "Current MCCL header is ${MCCL_INCLUDE_DIR}/mccl.h. 
") - message( - STATUS - "Current MCCL version is " - "v${MCCL_MAJOR_VERSION}.${MCCL_MINOR_VERSION}.${MCCL_PATCH_VERSION} ") - else() - message(FATAL_ERROR "WITH_MCCL is enabled but mccl.h file is not found!") - endif() -endif() diff --git a/cmake/mudnn.cmake b/cmake/mudnn.cmake deleted file mode 100644 index 81027890d144e3..00000000000000 --- a/cmake/mudnn.cmake +++ /dev/null @@ -1,92 +0,0 @@ -if(NOT WITH_MUSA) - return() -endif() - -if(WIN32) - return() -else() - set(MUDNN_ROOT - "/usr/local/musa" - CACHE PATH "MUDNN ROOT") -endif() - -find_path( - MUDNN_INCLUDE_DIR mudnn.h - PATHS ${MUDNN_ROOT} ${MUDNN_ROOT}/include $ENV{MUDNN_ROOT} - $ENV{MUDNN_ROOT}/include ${MUSA_TOOLKIT_INCLUDE} - NO_DEFAULT_PATH) - -set(TARGET_ARCH "x86_64") -if(NOT ${CMAKE_SYSTEM_PROCESSOR}) - set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -endif() - -list( - APPEND - MUDNN_CHECK_LIBRARY_DIRS - ${MUDNN_ROOT} - ${MUDNN_ROOT}/lib64 - ${MUDNN_ROOT}/lib - ${MUDNN_ROOT}/lib/x64 - ${MUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - $ENV{MUDNN_ROOT} - $ENV{MUDNN_ROOT}/lib64 - $ENV{MUDNN_ROOT}/lib - $ENV{MUDNN_ROOT}/lib/x64 - /usr/lib - ${MUSA_TOOLKIT_ROOT_DIR} - ${MUSA_TOOLKIT_ROOT_DIR}/lib/x64) -set(MUDNN_LIB_NAME "") - -if(LINUX) - set(MUDNN_LIB_NAME "libmudnn.so") -endif() - -find_library( - MUDNN_LIBRARY - NAMES ${MUDNN_LIB_NAME} - PATHS ${MUDNN_CHECK_LIBRARY_DIRS} ${MUDNN_INCLUDE_DIR} - NO_DEFAULT_PATH - DOC "Path to muDNN library.") - -if(MUDNN_INCLUDE_DIR AND MUDNN_LIBRARY) - set(MUDNN_FOUND ON) -else() - set(MUDNN_FOUND OFF) -endif() - -macro(find_mudnn_version mudnn_version_file) - file(READ ${mudnn_version_file} MUDNN_VERSION_FILE_CONTENTS) - get_filename_component(MUDNN_LIB_PATH ${MUDNN_LIBRARY} DIRECTORY) - - string(REGEX MATCH "define MUDNN_VERSION_MAJOR +([0-9]+)" MUDNN_MAJOR_VERSION - "${MUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MUDNN_VERSION_MAJOR +([0-9]+)" "\\1" - MUDNN_MAJOR_VERSION "${MUDNN_MAJOR_VERSION}") - string(REGEX MATCH "define MUDNN_VERSION_MINOR +([0-9]+)" MUDNN_MINOR_VERSION - "${MUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MUDNN_VERSION_MINOR +([0-9]+)" "\\1" - MUDNN_MINOR_VERSION "${MUDNN_MINOR_VERSION}") - string(REGEX MATCH "define MUDNN_VERSION_PATCH +([0-9]+)" MUDNN_PATCH_VERSION - "${MUDNN_VERSION_FILE_CONTENTS}") - string(REGEX REPLACE "define MUDNN_VERSION_PATCH +([0-9]+)" "\\1" - MUDNN_PATCH_VERSION "${MUDNN_PATCH_VERSION}") - - if(NOT MUDNN_MAJOR_VERSION) - set(MUDNN_VERSION "???") - else() - add_definitions("-DMUDNN_MAJOR_VERSION=\"${MUDNN_MAJOR_VERSION}\"") - math(EXPR MUDNN_VERSION "${MUDNN_MAJOR_VERSION} * 1000 + - ${MUDNN_MINOR_VERSION} * 100 + ${MUDNN_PATCH_VERSION}") - message(STATUS "Current muDNN version file is ${mudnn_version_file} ") - message( - STATUS - "Current muDNN version is v${MUDNN_MAJOR_VERSION}.${MUDNN_MINOR_VERSION}.${MUDNN_PATCH_VERSION}. 
" - ) - endif() -endmacro() - -if(MUDNN_FOUND) - find_mudnn_version(${MUDNN_INCLUDE_DIR}/mudnn_version.h) - include_directories(${MUDNN_INCLUDE_DIR}) -endif() diff --git a/cmake/musa.cmake b/cmake/musa.cmake deleted file mode 100644 index 63a85e827061cf..00000000000000 --- a/cmake/musa.cmake +++ /dev/null @@ -1,128 +0,0 @@ -if(NOT WITH_MUSA) - return() -endif() - -if(NOT DEFINED ENV{MUSA_PATH}) - set(MUSA_PATH - "/usr/local/musa" - CACHE PATH "Path to which ROCm has been installed") -else() - set(MUSA_PATH - $ENV{MUSA_PATH} - CACHE PATH "Path to which ROCm has been installed") -endif() -set(CMAKE_MODULE_PATH "${MUSA_PATH}/cmake" ${CMAKE_MODULE_PATH}) - -find_package(MUSA REQUIRED) -include_directories(${MUSA_PATH}/include) - -# set openmp include directory -set(llvm_openmp_search_list) -foreach(item RANGE 6 20 1) - list(APPEND llvm_openmp_search_list /usr/lib/llvm-${item}/include/openmp/) -endforeach() - -find_path( - OPENMP_INCLUDE_DIR omp.h - PATHS ${llvm_openmp_search_list} REQUIRED - NO_DEFAULT_PATH) -include_directories(${OPENMP_INCLUDE_DIR}) - -macro(find_musa_version musa_version_file) - set(python_file ${PROJECT_BINARY_DIR}/get_version.py) - set(MUSA_VERSION - "None" - CACHE STRING "musa version" FORCE) - file( - WRITE ${python_file} - "" - "import json\n" - "import sys\n" - "with open(sys.argv[1], 'r') as f:\n" - " data = json.load(f)\n" - " print(data[\"musa_runtime\"][\"version\"])" - "") - - execute_process( - COMMAND "python" "${python_file}" ${musa_version_file} - WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" - RESULT_VARIABLE python_res - OUTPUT_VARIABLE python_out - ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) - - if(python_res EQUAL 0) - set(MUSA_VERSION ${python_out}) - endif() - string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\1" MUSA_MAJOR_VERSION - "${MUSA_VERSION}") - string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\2" MUSA_MINOR_VERSION - "${MUSA_VERSION}") - string(REGEX REPLACE "([0-9]+)\.([0-9]+)\.([0-9]+)" "\\3" MUSA_PATCH_VERSION - "${MUSA_VERSION}") - - if(NOT MUSA_MAJOR_VERSION) - set(MUSA_VERSION "???") - message(WARNING "Cannot find MUSA version in ${MUSA_PATH}/version.json") - else() - math( - EXPR - MUSA_VERSION - "${MUSA_MAJOR_VERSION} * 10000 + ${MUSA_MINOR_VERSION} * 100 + ${MUSA_PATCH_VERSION}" - ) - message(STATUS "Current MUSA version file is ${MUSA_PATH}/version.json.") - message( - STATUS - "Current MUSA version is v${MUSA_MAJOR_VERSION}.${MUSA_MINOR_VERSION}.${MUSA_PATCH_VERSION} " - ) - endif() -endmacro() -find_musa_version(${MUSA_PATH}/version.json) - -list(APPEND MUSA_MCC_FLAGS -Wno-macro-redefined) -list(APPEND MUSA_MCC_FLAGS -Wno-deprecated-copy-with-user-provided-copy) -list(APPEND MUSA_MCC_FLAGS -Wno-pragma-once-outside-header) -list(APPEND MUSA_MCC_FLAGS -Wno-return-type) -list(APPEND MUSA_MCC_FLAGS -Wno-sign-compare) -list(APPEND MUSA_MCC_FLAGS -Wno-overloaded-virtual) -list(APPEND MUSA_MCC_FLAGS -Wno-mismatched-tags) -list(APPEND MUSA_MCC_FLAGS -Wno-pessimizing-move) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-but-set-variable) -list(APPEND MUSA_MCC_FLAGS -Wno-bitwise-instead-of-logical) -list(APPEND MUSA_MCC_FLAGS -Wno-format) -list(APPEND MUSA_MCC_FLAGS -Wno-self-assign) -list(APPEND MUSA_MCC_FLAGS -Wno-literal-conversion) -list(APPEND MUSA_MCC_FLAGS -Wno-literal-range) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-private-field) -list(APPEND MUSA_MCC_FLAGS -Wno-unknown-warning-option) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-variable) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-value) -list(APPEND MUSA_MCC_FLAGS 
-Wno-unused-local-typedef) -list(APPEND MUSA_MCC_FLAGS -Wno-unused-lambda-capture) -list(APPEND MUSA_MCC_FLAGS -Wno-reorder-ctor) -list(APPEND MUSA_MCC_FLAGS -Wno-braced-scalar-init) -list(APPEND MUSA_MCC_FLAGS -Wno-pass-failed) -list(APPEND MUSA_MCC_FLAGS -Wno-missing-braces) -list(APPEND MUSA_MCC_FLAGS -Wno-dangling-gsl) - -if(WITH_CINN) - list(APPEND MUSA_MCC_FLAGS -std=c++14) -else() - list(APPEND MUSA_MCC_FLAGS -std=c++17) -endif() - -list(APPEND MUSA_MCC_FLAGS --cuda-gpu-arch=mp_22) -list(APPEND MUSA_MCC_FLAGS -U__CUDA__) -# MUSA has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer -list(APPEND MUSA_MCC_FLAGS -D__MUSA_NO_HALF_CONVERSIONS__) - -#set(MUSA_VERBOSE_BUILD ON) -if(CMAKE_BUILD_TYPE MATCHES Debug) - list(APPEND MUSA_MCC_FLAGS -g2) - list(APPEND MUSA_MCC_FLAGS -O0) -else() - list(APPEND MUSA_MCC_FLAGS -O2) -endif() - -set(musa_runtime_library_name musart) -find_library(MUSARTC_LIB ${musa_runtime_library_name} HINTS ${MUSA_PATH}/lib) -message(STATUS "MUSARTC_LIB: ${MUSARTC_LIB}") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 60966c41e95b93..95273118c25057 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,11 +84,6 @@ function(register_cu_kernel TARGET) ${TARGET} SRCS ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) - elseif(WITH_MUSA) - musa_library( - ${TARGET} - SRCS ${cu_srcs} - DEPS ${op_library_DEPS} ${op_common_deps}) endif() set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} @@ -156,18 +151,14 @@ function(op_library TARGET) set(cc_srcs) set(cu_srcs) set(hip_srcs) - set(mu_srcs) set(cu_cc_srcs) set(hip_cc_srcs) - set(mu_cc_srcs) set(xpu_cc_srcs) set(xpu_kp_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) - set(mudnn_cu_cc_srcs) set(cudnn_cu_srcs) set(miopen_cu_srcs) - set(mudnn_cu_srcs) set(CUDNN_FILE) set(MIOPEN_FILE) set(mkldnn_cc_srcs) @@ -246,35 +237,6 @@ function(op_library TARGET) list(APPEND miopen_cu_srcs ${MIOPEN_FILE}.cu) endif() endif() - if(WITH_MUSA) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) - list(APPEND mu_cc_srcs ${TARGET}.cu.cc) - endif() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) - list(APPEND mu_srcs ${TARGET}.cu) - endif() - # rename in KP: .kps -> .cu - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) - file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) - file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps - ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - list(APPEND mu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) - endif() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - set(PART_CUDA_KERNEL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu - ${PART_CUDA_KERNEL_FILES} - PARENT_SCOPE) - list(APPEND mu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) - endif() - string(REPLACE "_op" "_cudnn_op" MUDNN_FILE "${TARGET}") - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MUDNN_FILE}.cu.cc) - list(APPEND mudnn_cu_cc_srcs ${MUDNN_FILE}.cu.cc) - endif() - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MUDNN_FILE}.cu) - list(APPEND mudnn_cu_srcs ${MUDNN_FILE}.cu) - endif() - endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) @@ -305,14 +267,6 @@ function(op_library TARGET) list(APPEND miopen_cu_cc_srcs ${src}) elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$") list(APPEND hip_cc_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES ".*_cudnn_op.cu$") - list(APPEND mudnn_cu_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES 
".*\\.cu$") - list(APPEND mu_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES ".*_cudnn_op.cu.cc$") - list(APPEND mudnn_cu_cc_srcs ${src}) - elseif(WITH_MUSA AND ${src} MATCHES ".*\\.cu.cc$") - list(APPEND mu_cc_srcs ${src}) elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$") list(APPEND cudnn_cu_srcs ${src}) elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu$") @@ -331,15 +285,13 @@ function(op_library TARGET) list(APPEND xpu_kp_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) - elseif((WITH_ROCM OR WITH_GPU OR WITH_MUSA) AND ${src} MATCHES ".*\\.kps$") + elseif((WITH_ROCM OR WITH_GPU) AND ${src} MATCHES ".*\\.kps$") string(REPLACE ".kps" ".cu" src_cu ${src}) file(COPY ${src} DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${src} ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) if(WITH_ROCM) list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) - elseif(WITH_MUSA) - list(APPEND mu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) else() list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${src_cu}) endif() @@ -439,26 +391,6 @@ function(op_library TARGET) SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) - elseif(WITH_MUSA) - list(REMOVE_ITEM mudnn_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") - list(REMOVE_ITEM mudnn_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") - list(REMOVE_ITEM mu_srcs "cholesky_op.cu") - list(REMOVE_ITEM mu_srcs "cholesky_solve_op.cu") - list(REMOVE_ITEM mu_srcs "lu_op.cu") - list(REMOVE_ITEM mu_srcs "matrix_rank_op.cu") - list(REMOVE_ITEM mu_srcs "svd_op.cu") - list(REMOVE_ITEM mu_srcs "eigvalsh_op.cu") - list(REMOVE_ITEM mu_srcs "qr_op.cu") - list(REMOVE_ITEM mu_srcs "eigh_op.cu") - list(REMOVE_ITEM mu_srcs "lstsq_op.cu") - list(REMOVE_ITEM mu_srcs "multinomial_op.cu") - list(REMOVE_ITEM mu_srcs "multiclass_nms3_op.cu") - message(STATUS "mu_cc_srcs: ${mu_cc_srcs}, cc_srcs: ${cc_srcs}") - musa_library( - ${TARGET} - SRCS ${cc_srcs} ${mu_cc_srcs} ${mudnn_cu_cc_srcs} ${mudnn_cu_srcs} - ${mkldnn_cc_srcs} ${mu_srcs} - DEPS ${op_library_DEPS} ${op_common_deps}) elseif(WITH_XPU_KP AND ${xpu_kp_cc_srcs_len} GREATER 0) xpu_library( ${TARGET} @@ -492,10 +424,8 @@ function(op_library TARGET) list(LENGTH cu_srcs cu_srcs_len) list(LENGTH hip_srcs hip_srcs_len) - list(LENGTH mu_srcs mu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH hip_cc_srcs hip_cc_srcs_len) - list(LENGTH mu_cc_srcs mu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) @@ -606,30 +536,12 @@ function(op_library TARGET) endif() endforeach() - # pybind USE_OP_DEVICE_KERNEL for MUSA - list(APPEND mu_srcs ${mu_cc_srcs}) - message("mu_srcs ${mu_srcs}") - foreach(mu_src ${mu_srcs}) - set(op_name "") - find_register(${mu_src} "REGISTER_OP_CUDA_KERNEL" op_name) - find_phi_register(${mu_src} ${pybind_file} "PD_REGISTER_KERNEL") - find_phi_register(${mu_src} ${pybind_file} "PD_REGISTER_STRUCT_KERNEL") - find_phi_register(${mu_src} ${pybind_file} - "PD_REGISTER_KERNEL_FOR_ALL_DTYPE") - if(NOT ${op_name} EQUAL "") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n") - set(pybind_flag 1) - endif() - endforeach() - - # pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs}) list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs}) list(APPEND cudnn_cu_srcs ${miopen_cu_srcs}) - list(APPEND cudnn_cu_srcs ${mudnn_cu_cc_srcs}) - list(APPEND 
cudnn_cu_srcs ${mudnn_cu_srcs}) list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len) + #message("cudnn_cu_srcs ${cudnn_cu_srcs}") if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL "activation_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") @@ -744,7 +656,7 @@ function(register_operators) string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) - message(STATUS "OPS in register_operators:${OPS}") + foreach(src ${OPS}) list(FIND register_operators_EXCLUDES ${src} _index) if(${_index} EQUAL -1) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index 499cc4c591bbfc..ead66697ef68cb 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -104,7 +104,7 @@ function(kernel_declare TARGET_LIST) endif() endif() # some gpu kernel only can run on cuda, not support rocm, so we add this branch - if(WITH_ROCM OR WITH_MUSA) + if(WITH_ROCM) string(FIND "${first_registry}" "cuda_only" pos) if(pos GREATER 1) set(first_registry "") diff --git a/paddle/cinn/ir/ir_base.h b/paddle/cinn/ir/ir_base.h index c333448d029ae0..0047100ebcfdfc 100644 --- a/paddle/cinn/ir/ir_base.h +++ b/paddle/cinn/ir/ir_base.h @@ -110,16 +110,23 @@ class Dim; macro__(Product) \ macro__(Sum) \ macro__(PrimitiveNode) \ - macro__(IntrinsicOp) \ macro__(_BufferRange_) \ macro__(ScheduleBlock) \ macro__(ScheduleBlockRealize) \ macro__(_Dim_) \ +#define NODETY_CONTROL_OP_FOR_INTRINSIC(macro__) \ + macro__(IntrinsicOp) \ #define NODETY_FORALL(__m) \ NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ NODETY_OP_FOR_EACH(__m) \ + NODETY_CONTROL_OP_FOR_INTRINSIC(__m) \ + NODETY_CONTROL_OP_FOR_EACH(__m) + +#define NODETY_FORALL_EXCEPT_INTRINSIC(__m) \ + NODETY_PRIMITIVE_TYPE_FOR_EACH(__m) \ + NODETY_OP_FOR_EACH(__m) \ NODETY_CONTROL_OP_FOR_EACH(__m) // clang-format on diff --git a/paddle/cinn/ir/utils/ir_nodes_collector.cc b/paddle/cinn/ir/utils/ir_nodes_collector.cc index ac2f0317e9213f..e4ebaca653bae9 100644 --- a/paddle/cinn/ir/utils/ir_nodes_collector.cc +++ b/paddle/cinn/ir/utils/ir_nodes_collector.cc @@ -15,6 +15,8 @@ #include "paddle/cinn/ir/utils/ir_nodes_collector.h" #include +#include "paddle/cinn/ir/intrinsic_ops.h" +#include "paddle/cinn/ir/ir.h" #include "paddle/cinn/ir/ir_mutator.h" #include "paddle/cinn/ir/ir_printer.h" @@ -71,8 +73,71 @@ struct IrNodesCollector : public IRVisitorRequireReImpl { } \ } - NODETY_FORALL(__m) + NODETY_FORALL_EXCEPT_INTRINSIC(__m) #undef __m + + void Visit(const ir::IntrinsicOp* op) { + switch (op->getKind()) { +#define __(x) \ + case ir::IntrinsicKind::k##x: \ + Visit(llvm::dyn_cast(op)); \ + break; + + INTRINSIC_KIND_FOR_EACH(__) +#undef __ + } + } + + void Visit(const ir::intrinsics::GetAddr* x) { + if (x->data.defined()) { + Visit(&(x->data)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::BufferGetDataConstHandle* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::PodValueToX* x) { + if (x->pod_value_ptr.defined()) { + Visit(&(x->pod_value_ptr)); + } + } + + void Visit(const ir::intrinsics::BufferCreate* x) { + if (x->buffer.defined()) { + Visit(&(x->buffer)); + } + } + + void Visit(const ir::intrinsics::ArgsConstruct* x) { + if (x->var.defined()) { + Expr convert = Expr(x->var); + Visit(&convert); + } + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + + void Visit(const 
ir::intrinsics::BuiltinIntrin* x) { + for (int i = 0; i < x->args.size(); ++i) { + if (x->args[i].defined()) { + Visit(&(x->args[i])); + } + } + } + std::set visited_; }; diff --git a/paddle/common/array.h b/paddle/common/array.h index 20f7904fc3bd19..11457a1eaa756b 100644 --- a/paddle/common/array.h +++ b/paddle/common/array.h @@ -54,7 +54,7 @@ class Array { } HOSTDEVICE inline T &at(size_t i) { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)&& !defined(__MUSACC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) COMMON_ENFORCE_LT( i, N, common::errors::OutOfRange("Array index out of bounds.")); #endif @@ -62,7 +62,7 @@ class Array { } HOSTDEVICE inline const T &at(size_t i) const { -#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__)&& !defined(__MUSACC__) +#if !defined(__CUDA_ARCH__) && !defined(__HIPCC__) COMMON_ENFORCE_LT( i, N, common::errors::OutOfRange("Array index out of bounds.")); #endif @@ -103,7 +103,7 @@ class Array { HOSTDEVICE inline T *GetMutable() { return nullptr; } HOSTDEVICE inline T &operator[](size_t) { -#if defined(__HIPCC__) || defined(__MUSACC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static T obj{}; @@ -114,7 +114,7 @@ class Array { } HOSTDEVICE inline const T &operator[](size_t) const { -#if defined(__HIPCC__) || defined(__MUSACC__) || defined(__CUDA_ARCH__) +#if defined(__HIPCC__) || defined(__CUDA_ARCH__) // HIP and CUDA will have compile error, if use "obj()" // function declared in block scope cannot have 'static' storage class static const T obj{}; diff --git a/paddle/common/hostdevice.h b/paddle/common/hostdevice.h index f7070893d83b58..7f8cf135634341 100644 --- a/paddle/common/hostdevice.h +++ b/paddle/common/hostdevice.h @@ -18,10 +18,6 @@ #include #endif -#ifdef __MUSACC__ -#include -#endif - #if defined(__xpu__) #include @@ -30,7 +26,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/common/macros.h b/paddle/common/macros.h index 8189b3147db8cc..2d476c58cb6ae1 100644 --- a/paddle/common/macros.h +++ b/paddle/common/macros.h @@ -72,7 +72,7 @@ namespace common { #define PD_CONCATENATE2(arg1, arg2) arg1##arg2 #define PD_EXPAND(x) x -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #define PADDLE_RESTRICT __restrict__ #else #define PADDLE_RESTRICT diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index dd6309f7da3608..d42b810972dc85 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -15,7 +15,7 @@ if(WITH_DISTRIBUTE) DEPS phi common eager_api gloo_wrapper) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) cc_library( process_group_nccl SRCS process_group_nccl.cc common.cc @@ -63,7 +63,7 @@ if(WITH_CUSTOM_DEVICE) endif() set(COMM_UTILS_DEPS process_group) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) set(COMM_UTILS_DEPS ${PROCESS_GROUP_UTILS_DEPS} process_group_nccl) endif() if(WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/distributed/collective/process_group_nccl.cc 
b/paddle/fluid/distributed/collective/process_group_nccl.cc index dd3e1f410ee0d2..6732ea375d500e 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.cc +++ b/paddle/fluid/distributed/collective/process_group_nccl.cc @@ -106,8 +106,6 @@ bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { // If we use the work to do barrier, we should block cpu #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -139,20 +137,18 @@ ProcessGroupNCCL::~ProcessGroupNCCL() { } void ProcessGroupNCCL::GroupStart() { - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); ++s_group_call_counter; } void ProcessGroupNCCL::GroupEnd() { - MCCL_CHECK(phi::dynload::mcclGroupEnd()); + NCCL_CHECK(phi::dynload::ncclGroupEnd()); --s_group_call_counter; // NOTE: This is to sync the calc stream and comm stream for debug using // batch_isend_irecv if (FLAGS_benchmark || FLAGS_benchmark_nccl) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -183,7 +179,7 @@ phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( } } -mcclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { +ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { const std::string& key = GetKeyFromPlace(place); const auto& iter = place_to_comm_ctx_.find(key); PADDLE_ENFORCE_NE( @@ -208,7 +204,7 @@ std::shared_ptr ProcessGroupNCCL::AllGather( numel > 0 ? GetPartialTensor(tensor_tmp, offset, numel) : tensor_tmp; return Collective( [&](phi::distributed::NCCLCommContext* comm_context, gpuStream_t stream) { - VLOG(3) << "[mcclAllGather] " + VLOG(3) << "[ncclAllGather] " << "sendbuff: " << in_tensor_maybe_partial.data() << ", recvbuff: " << out_tensor->data() << ", count: " << in_tensor_maybe_partial.numel() @@ -239,7 +235,7 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( paddle::experimental::CheckAndTrans2NewContiguousTensor(in_tensor); return Collective( [&](phi::distributed::NCCLCommContext* comm_context, gpuStream_t stream) { - VLOG(3) << "[mcclAllReduce] " + VLOG(3) << "[ncclAllReduce] " << "sendbuff: " << tensor_tmp.data() << ", recvbuff: " << out_tensor->data() << ", count: " << tensor_tmp.numel() << ", datatype: " @@ -708,7 +704,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, << ", store_key: " << store_key; for (size_t i = 0; i < s_group_call_counter; ++i) { - MCCL_CHECK(phi::dynload::mcclGroupEnd()); + NCCL_CHECK(phi::dynload::ncclGroupEnd()); } bool is_batch_p2p = s_group_call_counter > 0; @@ -717,13 +713,13 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, int num_ranks = is_p2p_op ? 2 : GetSize(); int rank = is_p2p_op ? 
p2p_rank : GetRank(); - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); phi::distributed::P2POption p2p_opts({is_p2p_op, p2p_rank, num_ranks, rank}); phi::distributed::CommContextManager::CreateNCCLCommContext( store_, store_key, rank_, size_, "", &p2p_opts); - MCCL_CHECK(phi::dynload::mcclGroupEnd()); + NCCL_CHECK(phi::dynload::ncclGroupEnd()); auto nccl_comm_ctx = this->GetCommContext(&store_key); VLOG(3) << "Get nccl comm: " << nccl_comm_ctx->GetNcclComm() @@ -751,10 +747,10 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()), gpu_global_ranks_size); - MCCL_CHECK(phi::dynload::mcclAllGather(gpu_global_rank->ptr(), + NCCL_CHECK(phi::dynload::ncclAllGather(gpu_global_rank->ptr(), gpu_global_ranks->ptr(), 1, - mcclInt, + ncclInt, nccl_comm_ctx->GetNcclComm(), comm_ctx->stream())); @@ -787,7 +783,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, place_to_comm_ctx_.emplace(place_key, std::move(comm_ctx)); for (size_t i = 0; i < s_group_call_counter; ++i) { - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); } } @@ -882,8 +878,6 @@ std::shared_ptr ProcessGroupNCCL::Collective( if (FLAGS_benchmark || FLAGS_benchmark_nccl) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif @@ -999,8 +993,6 @@ std::shared_ptr ProcessGroupNCCL::Point2Point( if (!is_batch_p2p && (FLAGS_benchmark || FLAGS_benchmark_nccl)) { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else // PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif diff --git a/paddle/fluid/distributed/collective/process_group_nccl.h b/paddle/fluid/distributed/collective/process_group_nccl.h index 8a626d701b3245..22d90370f16afc 100644 --- a/paddle/fluid/distributed/collective/process_group_nccl.h +++ b/paddle/fluid/distributed/collective/process_group_nccl.h @@ -175,7 +175,7 @@ class ProcessGroupNCCL final : public ProcessGroupWithStream { static void GroupEnd(); - mcclComm_t NCCLComm(const Place& place) const; + ncclComm_t NCCLComm(const Place& place) const; private: std::shared_ptr CreateTask(const Place& place, diff --git a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc index 9061ce7aeaa068..eec697f5239450 100644 --- a/paddle/fluid/distributed/collective/processgroup_comm_utils.cc +++ b/paddle/fluid/distributed/collective/processgroup_comm_utils.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/phi/backends/c_comm_lib.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #if defined(PADDLE_WITH_CUSTOM_DEVICE) @@ -33,7 +33,7 @@ namespace detail { // In principle, the PHI Kernel cannot use the global singleton internally, // and the required members need to be passed in from the eucalyptus tree. 
ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) paddle::distributed::ProcessGroup* pg = nullptr; if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( @@ -45,7 +45,7 @@ ccl::CCLComm GetCCLComm(const Place& place, int global_gid) { } #endif if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) return static_cast(pg)->NCCLComm( place); #else diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 591e083d005a44..6165dfc27e38ef 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -372,7 +372,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { paddle::experimental::empty(IntArray({all_length_}), dtype_, place); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *default_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place)); ConcatTensorsWithType( @@ -419,7 +419,7 @@ void EagerGroup::ConcatTensors(const platform::Place &place) { void EagerGroup::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto &gpu_context = static_cast(context); SplitTensorsWithType( gpu_context, &dense_contents_, &dense_tensors_, dtype_); @@ -1112,7 +1112,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, auto *dev_ctx = platform::DeviceContextPool::Instance().Get(inner_place_); // NOLINT if (platform::is_gpu_place(inner_place_)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(inner_place_)); #else diff --git a/paddle/fluid/distributed/common/chunk_allocator.h b/paddle/fluid/distributed/common/chunk_allocator.h index 17f7bb14224d35..7b19b3a1098398 100644 --- a/paddle/fluid/distributed/common/chunk_allocator.h +++ b/paddle/fluid/distributed/common/chunk_allocator.h @@ -14,6 +14,7 @@ #pragma once #include +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace distributed { @@ -77,9 +78,16 @@ class ChunkAllocator { void create_new_chunk() { Chunk* chunk; - posix_memalign(reinterpret_cast(&chunk), - std::max(sizeof(void*), alignof(Chunk)), - sizeof(Chunk) + sizeof(Node) * _chunk_size); + size_t alloc_size = sizeof(Chunk) + sizeof(Node) * _chunk_size; + int error = posix_memalign(reinterpret_cast(&chunk), + std::max(sizeof(void*), alignof(Chunk)), + alloc_size); + PADDLE_ENFORCE_EQ(error, + 0, + paddle::platform::errors::ResourceExhausted( + "Fail to alloc memory of %ld size, error code is %d.", + alloc_size, + error)); chunk->next = _chunks; _chunks = chunk; diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index c896786c657f61..82a3514f2791f9 100644 --- 
a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -277,7 +277,7 @@ static std::shared_ptr GetGC( int64_t max_memory_size = framework::GetEagerDeletionThreshold(); std::shared_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { if (framework::IsFastEagerDeletionModeEnabled()) { gc.reset(new framework::UnsafeFastGPUGarbageCollector(place, diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc index 61e0732f89f5bc..704dd16400065c 100644 --- a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -71,7 +71,7 @@ bool CondInterceptor::GetCondResult() { const auto& cond_tensor = cond_var->Get(); bool res = false; if (platform::is_gpu_place(cond_tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::DenseTensor cpu_tensor; framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index 0117a472ef06d3..a1fd38295319ed 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -76,7 +76,7 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_data.data.length()); } else if (platform::is_gpu_place(place)) { VLOG(3) << "Loading data for GPU."; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = dynamic_cast(pool.Get(place)); auto gpu_place = place; diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index 6dc9cff9d9120b..b5786e23933930 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -51,7 +51,7 @@ void MessageBus::Init( addr_)); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) // NOTE: To make the brpc is compatible with collective, // need release the handler holding the ip address. 
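
Aside: the chunk_allocator.h hunk above replaces an unchecked posix_memalign call with one whose return code is validated (via PADDLE_ENFORCE_EQ in Paddle). Below is a minimal, standalone sketch of the same pattern using a plain runtime check instead of Paddle's enforce macros; the Chunk/Node layout and sizes are simplified placeholders, not Paddle's real allocator types, and posix_memalign assumes a POSIX platform.

#include <algorithm>
#include <cstdio>
#include <cstdlib>

struct Node { int value; };
struct Chunk { Chunk* next; };  // node storage lives in the bytes allocated past the header

int main() {
  const size_t chunk_size = 1024;  // illustrative capacity
  const size_t alloc_size = sizeof(Chunk) + sizeof(Node) * chunk_size;
  void* raw = nullptr;
  // posix_memalign returns 0 on success; a nonzero code (e.g. ENOMEM) must not be ignored,
  // because raw is left unspecified on failure.
  int error = posix_memalign(&raw, std::max(sizeof(void*), alignof(Chunk)), alloc_size);
  if (error != 0) {
    std::fprintf(stderr, "Fail to alloc memory of %zu size, error code is %d.\n",
                 alloc_size, error);
    return 1;
  }
  Chunk* chunk = static_cast<Chunk*>(raw);
  chunk->next = nullptr;
  std::free(chunk);
  return 0;
}

The alignment argument mirrors the patch: std::max(sizeof(void*), alignof(Chunk)) keeps posix_memalign's requirement that the alignment be a power of two and a multiple of sizeof(void*).
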
diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc index 2bd9213cae610d..47509d025722d8 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/multiply_fwd_func.cc @@ -61,8 +61,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); @@ -407,8 +408,9 @@ paddle::Tensor multiply_ad_func(const paddle::Tensor& x, // Type promotion Logic if (phi::NeedTypePromotion(x.dtype(), y.dtype())) { VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion " - "automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) + << "got different data type, run type protmotion " + "automatically, this may cause data type been changed."; auto op_name = phi::TransToFluidOpName("multiply"); auto promotion_type = phi::GetPromoteDtype(op_name, x.dtype(), y.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index bef2878e706f55..a6bb716e6b7ade 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -27,10 +27,6 @@ if(WITH_ROCM) target_link_libraries(eager_generator ${ROCM_HIPRTC_LIB}) endif() -if(WITH_MUSA) - target_link_libraries(eager_generator ${MUSARTC_LIB}) -endif() - if(WITH_CINN) target_link_libraries(eager_generator ${PYTHON_LIBRARIES}) endif() diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 2a96fddccbce70..75d6cb94c6b5f2 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -528,7 +528,7 @@ class {} : public egr::GradNodeBase {{ TYPE_PROMOTION_LOGIC_TEMPLATE = """ if (phi::NeedTypePromotion({x}.dtype(), {y}.dtype())) {{ VLOG(5) << "got different data type, run type protmotion automatically."; - LOG(WARNING) << "got different data type, run type protmotion automatically, this may cause data type been changed."; + LOG_FIRST_N(WARNING, 1) << "got different data type, run type protmotion automatically, this may cause data type been changed."; {op_name} auto promotion_type = phi::GetPromoteDtype(op_name, {x}.dtype(), {y}.dtype()); diff --git a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py index f93f41a21553a3..daf16f446ab12c 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/python_c_gen.py @@ -146,7 +146,7 @@ def FindParsingFunctionFromAttributeType(atype): FUNCTION_SET_DEVICE_TEMPLATE = """{} SetPythonStack(); if (paddle::platform::is_gpu_place(place)) {{ -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::backends::gpu::SetDeviceId(place.device); VLOG(4) <<"CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId() << " from " << (int)place.device; #else diff --git a/paddle/fluid/eager/nan_inf_utils.cc b/paddle/fluid/eager/nan_inf_utils.cc index 2da9994b7671ce..a1e62ea6ba519b 100644 --- a/paddle/fluid/eager/nan_inf_utils.cc +++ b/paddle/fluid/eager/nan_inf_utils.cc @@ -103,7 +103,7 @@ void CheckTensorHasNanOrInf(const std::string& api_name, const Tensor& tensor) { auto& place = dense_tensor->place(); if (paddle::platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::framework::details::tensor_check( api_name, tensor_name, *dense_tensor, place); #else diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 8aab6bf2a201ab..8aa03e98809fb2 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -213,11 +213,6 @@ elseif(WITH_ROCM) data_type_transform SRCS data_type_transform.cu DEPS tensor) -elseif(WITH_MUSA) - musa_library( - data_type_transform - SRCS data_type_transform.cu - DEPS tensor) elseif(WITH_XPU) cc_library( data_type_transform @@ -466,7 +461,7 @@ if(WITH_PYTHON) ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - if(WITH_GPU) + if(NOT WITH_ROCM) add_custom_target( fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 6621b74740f250..1620c99ce8560d 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -45,19 +45,6 @@ class ConvSearchCache { AlgorithmsCache* GetConvFusion() { return &fusion_forward_cache_; } -#elif defined(PADDLE_WITH_MUSA) - // AlgorithmsCache* GetForward() { - // return &forward_cache_; - // } - // AlgorithmsCache* GetBackwardData() { - // return &backward_data_cache_; - // } - // AlgorithmsCache* GetBackwardFilter() { - // return &backward_filter_cache_; - // } - // AlgorithmsCache* GetConvFusion() { - // return &fusion_forward_cache_; - // } #else AlgorithmsCache* GetForward() { return &forward_cache_; @@ -85,11 +72,6 @@ class ConvSearchCache { AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; AlgorithmsCache fusion_forward_cache_; -#elif defined(PADDLE_WITH_MUSA) - // AlgorithmsCache forward_cache_; - // AlgorithmsCache backward_data_cache_; - // AlgorithmsCache backward_filter_cache_; - // AlgorithmsCache fusion_forward_cache_; #else AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 4d2236ed1e66f7..bf2f9e4379b693 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -124,7 +124,7 @@ static void RunKernelFunc( "Input tensor (%s) is not initialized.", in_name)); paddle::Tensor custom_in; custom_in.set_impl(std::make_shared(*x)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if 
(custom_in.is_gpu_pinned()) { VLOG(3) << "Custom Operator: custom input is gpu pinned tensor"; auto gpu_place = phi::GPUPlace(platform::GetCurrentDeviceId()); @@ -936,7 +936,7 @@ static void RegisterOperatorKernel( } RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RegisterOperatorKernelWithPlace( name, op_kernel_func, proto::VarType::RAW, platform::CUDAPlace()); #endif diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index d3525c80d56db2..4a72f339a85cbc 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -271,8 +271,6 @@ void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) { cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); #elif defined(PADDLE_WITH_HIP) hipMemcpy(dst, src, size, hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(dst, src, size, musaMemcpyHostToDevice); #elif defined(PADDLE_WITH_XPU_KP) xpu_memcpy(dst, src, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE); #else @@ -1531,7 +1529,7 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( #endif } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) template void PrivateInstantDataFeed::PutToFeedVec() { for (size_t i = 0; i < use_slots_.size(); ++i) { diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu index 57cf488d2a3014..156c70b9825382 100644 --- a/paddle/fluid/framework/data_feed.cu +++ b/paddle/fluid/framework/data_feed.cu @@ -2982,7 +2982,7 @@ std::shared_ptr GetNodeDegree( } int multi_node_sync_sample(int flag, - const mcclRedOp_t &op, + const ncclRedOp_t &op, const paddle::platform::Place &place, const int gpu_id, phi::DenseTensor *multi_node_sync_stat_ptr) { @@ -2998,8 +2998,8 @@ int multi_node_sync_sample(int flag, int *stat_ptr = multi_node_sync_stat_ptr->data(); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); auto stream = comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( - &stat_ptr[flag], &stat_ptr[3], 1, mcclInt, op, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + &stat_ptr[flag], &stat_ptr[3], 1, ncclInt, op, comm->comm(), stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output &stat_ptr[3], sizeof(int), @@ -3011,7 +3011,7 @@ int multi_node_sync_sample(int flag, } int get_multi_node_global_flag(int local_flag, - const mcclRedOp_t &op, + const ncclRedOp_t &op, const paddle::platform::Place &place, const int gpu_id, cudaStream_t stream) { @@ -3025,10 +3025,10 @@ int get_multi_node_global_flag(int local_flag, send_buff_ptr, &local_flag, sizeof(int), cudaMemcpyHostToDevice, stream); cudaStreamSynchronize(stream); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - mcclInt, + ncclInt, op, comm->comm(), stream)); @@ -3177,7 +3177,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, // to decide whether to continue sampling if (FLAGS_enable_graph_multi_node_sampling) { switch_command = multi_node_sync_sample( - 
switch_flag, mcclProd, place, conf.gpuid, multi_node_sync_stat_ptr); + switch_flag, ncclProd, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " switch_flag:" << switch_flag << "," << switch_command; if (switch_command) { @@ -3187,7 +3187,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, } sample_command = multi_node_sync_sample( - sample_flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + sample_flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node sample sync" << " sample_flag:" << sample_flag << "," << sample_command; if (sample_command == EVENT_FINISH_EPOCH) { @@ -3280,7 +3280,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, if (FLAGS_enable_graph_multi_node_sampling) { int flag = *jump_rows_ptr > 0 ? 1 : 0; int command = multi_node_sync_sample( - flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; if (command <= 0) { @@ -3326,7 +3326,7 @@ int FillWalkBuf(const std::vector &h_device_keys_len, // Step synchronization for multi-step sampling in multi node int flag = sample_res.total_sample_size > 0 ? 1 : 0; int command = multi_node_sync_sample( - flag, mcclMax, place, conf.gpuid, multi_node_sync_stat_ptr); + flag, ncclMax, place, conf.gpuid, multi_node_sync_stat_ptr); VLOG(2) << "gpuid:" << conf.gpuid << " multi node step sync" << " step:" << step << " step_sample:" << flag << "," << command; @@ -3846,7 +3846,7 @@ void GraphDataGenerator::DoWalkandSage() { } else { if (conf_.sage_mode) { global_train_flag_ = get_multi_node_global_flag( - local_train_flag, mcclProd, place_, conf_.gpuid, sample_stream_); + local_train_flag, ncclProd, place_, conf_.gpuid, sample_stream_); VLOG(1) << "gpu_id: " << conf_.gpuid << ", local_train_flag: " << local_train_flag << ", global_train_flag: " << global_train_flag_; @@ -4010,7 +4010,7 @@ void GraphDataGenerator::DoSageForTrain() { // check whether reach sage pass end if (conf_.is_multi_node) { int res = multi_node_sync_sample(sage_pass_end, - mcclProd, + ncclProd, place_, conf_.gpuid, &multi_node_sync_stat_); @@ -4165,7 +4165,7 @@ void GraphDataGenerator::DoSageForInfer() { int local_pass_end = total_instance == 0; if (conf_.is_multi_node) { global_pass_end = get_multi_node_global_flag( - local_pass_end, mcclProd, place_, conf_.gpuid, sample_stream_); + local_pass_end, ncclProd, place_, conf_.gpuid, sample_stream_); } else { global_pass_end = local_pass_end; } @@ -4261,11 +4261,11 @@ int dynamic_adjust_total_row_for_infer(int local_reach_end, stream); cudaStreamSynchronize(stream); auto comm = platform::NCCLCommContext::Instance().Get(0, place.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - mcclInt, - mcclProd, + ncclInt, + ncclProd, comm->comm(), stream)); int global_reach_end = 0; @@ -4356,7 +4356,7 @@ bool FillInferBuf( global_infer_node_type_start[infer_cursor] + conf.buf_size >= device_key_size; int global_reach_end = get_multi_node_global_flag( - local_reach_end, mcclProd, place, conf.gpuid, stream); + local_reach_end, ncclProd, place, conf.gpuid, stream); int remain = device_key_size - global_infer_node_type_start[infer_cursor]; if (global_reach_end) { 
*total_row_ptr = remain; @@ -5005,11 +5005,11 @@ int GraphDataGenerator::dynamic_adjust_batch_num_for_sage() { cudaStreamSynchronize(sample_stream_); auto comm = platform::NCCLCommContext::Instance().Get(0, place_.GetDeviceId()); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&send_buff_ptr[0], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&send_buff_ptr[0], &send_buff_ptr[1], 1, - mcclInt, - mcclMax, + ncclInt, + ncclMax, comm->comm(), sample_stream_)); int thread_max_batch_num = 0; diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 492c7629abf9eb..243c5c818f5887 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -2023,7 +2023,7 @@ class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { int pv_batch_size_; }; -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) template class PrivateInstantDataFeed : public DataFeed { public: diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index 010661fef6e8ab..88afa021b7c1b9 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -70,7 +70,7 @@ REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif } // namespace framework diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index b2fb089f535749..9d114fcf563963 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -101,7 +101,7 @@ struct CastDataType { in_end, out_begin, CastDataTypeFunctor()); -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) } else if (platform::is_gpu_place(in_.place())) { phi::Transform trans; auto* context = static_cast(ctx_); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index f43c20a0d3a94c..f0c2b60f41b69d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -221,75 +221,6 @@ elseif(WITH_ROCM) fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) -elseif(WITH_MUSA) - musa_library( - nan_inf_utils - SRCS nan_inf_utils_detail.cc - DEPS framework_proto scope place phi common) - musa_library( - all_reduce_op_handle - SRCS all_reduce_op_handle.cc - DEPS op_handle_base - scope - lod_tensor - phi - common - memory - dynload_cuda - variable_visitor) - musa_library( - fused_all_reduce_op_handle - SRCS fused_all_reduce_op_handle.cc - DEPS all_reduce_op_handle - op_handle_base - variable_visitor - scope - lod_tensor - phi - common - memory - dynload_cuda - place) - musa_library( - grad_merge_all_reduce_op_handle - SRCS grad_merge_all_reduce_op_handle.cc - DEPS fused_all_reduce_op_handle - op_handle_base - scope - lod_tensor - phi - common - memory - dynload_cuda - variable_visitor - place - all_reduce_op_handle) - - 
if(WITH_DISTRIBUTE) - musa_library( - reduce_op_handle - SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi common dynload_cuda) - else() - musa_library( - reduce_op_handle - SRCS reduce_op_handle.cc - DEPS op_handle_base variable_visitor scope phi common dynload_cuda) - endif() - musa_library( - broadcast_op_handle - SRCS broadcast_op_handle.cc - DEPS op_handle_base - scope - phi - common - memory - variable_visitor - dynload_cuda) - musa_library( - fused_broadcast_op_handle - SRCS fused_broadcast_op_handle.cc - DEPS broadcast_op_handle) else() cc_library( nan_inf_utils @@ -489,7 +420,7 @@ endif() if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) + AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() cc_library( diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 087a629d493444..b064a2aded0bcb 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -28,7 +28,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -207,17 +207,17 @@ void AllReduceOpHandle::AllReduceFunc( const std::vector &places, const std::vector &out_var_names) { if (platform::is_gpu_place(places[0])) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, platform::errors::InvalidArgument( "The nccl context should not be NULL.")); - mcclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); + ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); std::vector> all_reduce_calls; for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { auto &p = places[i]; void *buffer = const_cast(lod_tensor_data.at(i)); all_reduce_calls.emplace_back([=] { - NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, mcclSum); + NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum); }); } NCCLAllReduceFunc(all_reduce_calls); @@ -300,7 +300,7 @@ void AllReduceOpHandle::SyncBKCLAllReduce() { } #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void AllReduceOpHandle::NCCLAllReduceFunc( const std::vector> &all_reduce_calls) { this->RunAndRecordEvent([&] { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index 0e2c06311bf385..685ab0b957a448 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -31,7 +31,7 @@ namespace platform { class NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include 
"paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -43,7 +43,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class AllReduceOpHandle : public NCCLOpHandleBase { public: AllReduceOpHandle(ir::Node *node, @@ -77,14 +77,14 @@ class AllReduceOpHandle : public OpHandleBase { std::vector local_scopes_; -#if !defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL) && \ +#if !defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && \ !defined(PADDLE_WITH_XPU_BKCL) // NCCLOpHandleBase and BKCLOpHandleBase already have these attributes. // Will polish it by class inheritance framework. std::vector places_; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void NCCLAllReduceFunc( const std::vector> &all_reduce_calls); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 98672d09a2452e..b79eff24ee87d7 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -88,7 +88,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else if (platform::is_gpu_place(in_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) VarHandle *out_handle = nullptr; int root_id = in_tensor.place().device; // NOLINT std::vector> broadcast_calls; @@ -118,9 +118,9 @@ void BroadcastOpHandle::BroadcastOneVar( broadcast_calls.emplace_back( [send_recv_buffer, numel, type, root_id, &nccl_ctx] { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBcast(send_recv_buffer, + platform::dynload::ncclBcast(send_recv_buffer, numel, - static_cast(type), + static_cast(type), root_id, nccl_ctx.comm_, nccl_ctx.stream())); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 3300c48b165853..9fbe2764913b55 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -34,7 +34,7 @@ class Node; } // namespace ir } // namespace framework namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) struct NCCLContextMap; #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -43,7 +43,7 @@ struct BKCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" @@ -55,7 +55,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -109,7 +109,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector 
local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLContextMap *nccl_ctxs_; #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLContextMap *bkcl_ctxs_; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 5b8857977c9fab..5a6f4e6e70d4c1 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -186,7 +186,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "fuse_relu_depthwise_conv_pass"); AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MCCL)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #endif @@ -348,7 +348,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const { #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) @@ -380,7 +380,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kNRanks); pass->Set(kNRanks, new size_t(nranks)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -400,7 +400,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kLocalScopes); pass->SetNotOwned>(kLocalScopes, &local_scopes); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -428,7 +428,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; } else if (pass->Type() == "all_reduce_deps_pass") { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nctx = (use_device == p::kCUDA) ? 
nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); @@ -545,7 +545,7 @@ USE_PASS(fused_feedforward_pass); #ifdef PADDLE_WITH_DNNL USE_PASS(mkldnn_placement_pass); #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MCCL)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) USE_PASS(fusion_group_pass); #endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 90cf7fe82ebfd2..203525d5a74821 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -217,7 +217,7 @@ struct BuildStrategy { const std::string &loss_var_name, const std::vector &local_scopes, const size_t &nranks, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const; #elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 89d72a1b8213a5..4012263f688cb5 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include @@ -44,7 +44,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( place_(place), var_infos_(vars.begin(), vars.end()), gc_(gc) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { dev_ctx_ = reinterpret_cast( platform::DeviceContextPool::Instance().Get(place)); @@ -53,9 +53,6 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&event_, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); @@ -78,14 +75,12 @@ EagerDeletionOpHandle::EagerDeletionOpHandle( } EagerDeletionOpHandle::~EagerDeletionOpHandle() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (event_) { auto gpu_place = dev_ctx_->GetPlace(); platform::CUDADeviceGuard guard(gpu_place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif @@ -94,7 +89,7 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() { } void EagerDeletionOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif @@ -182,7 +177,7 @@ void EagerDeletionOpHandle::RunImpl() { void 
EagerDeletionOpHandle::ClearGarbages( std::deque> *garbages) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (event_) { auto compute_stream = dev_ctx_->stream(); auto callback_stream = @@ -192,10 +187,6 @@ void EagerDeletionOpHandle::ClearGarbages( PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(callback_stream, event_, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(callback_stream, event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -206,7 +197,7 @@ void EagerDeletionOpHandle::ClearGarbages( } else { #endif gc_->Add(std::move(*garbages)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif } diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h index 049b0c2ec478b4..0a92269c50ad2d 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.h +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -80,7 +80,7 @@ class EagerDeletionOpHandle : public OpHandleBase { std::vector var_infos_; // not own GarbageCollector *gc_; // not own std::vector vars_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::GPUContext *dev_ctx_{nullptr}; gpuEvent_t event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index be3b196c3ca6ca..ee78d366711075 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -135,7 +135,7 @@ static void TransData(const phi::DenseTensor *src_item, const platform::DeviceContext &ctx) { if (src_item->IsInitialized() && src_item->numel() > 0) { if (platform::is_gpu_place(src_item->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TensorCopy(*src_item, platform::CUDAPinnedPlace(), ctx, dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 0ab7767aca0bac..27be4b77176350 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -121,7 +121,7 @@ static void TransData(const phi::DenseTensor &src_item, phi::DenseTensor *dst_item) { if (src_item.IsInitialized() && src_item.numel() > 0) { if (platform::is_gpu_place(src_item.place())) { // NOLINT -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TensorCopy(src_item, platform::CPUPlace(), dst_item); #endif } else { diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index b1db6b334013d3..53746482d58a80 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -32,7 +32,7 @@ typedef std::vector< 
std::vector>> GradientAndLoDTensor; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -61,13 +61,11 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( #endif FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto destroy_event = [](gpuEvent_t event) { if (event == nullptr) return; #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif @@ -82,7 +80,7 @@ void FusedAllReduceOpHandle::RunImpl() { Name(), platform::TracerEventType::Communication, 1); VLOG(4) << this->DebugString(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (FLAGS_allreduce_record_one_event && start_event_ == nullptr) { VLOG(10) << "FLAGS_allreduce_record_one_event=true"; PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, @@ -105,9 +103,6 @@ void FusedAllReduceOpHandle::RunImpl() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(event, hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(event, cudaEventDisableTiming)); @@ -131,10 +126,6 @@ void FusedAllReduceOpHandle::RunImpl() { PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(nccl_stream, start_event_, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(start_event_, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(nccl_stream, start_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event_, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS( @@ -194,16 +185,12 @@ void FusedAllReduceOpHandle::RunImpl() { FusedAllReduceFunc(in_var_handles, out_var_handles); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (FLAGS_allreduce_record_one_event) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(compute_stream, end_event_, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(end_event_, nccl_stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(compute_stream, end_event_, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS( diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index a5c6c431f1742e..533d1d0860a553 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -33,7 +33,7 @@ namespace platform { class NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -44,7 +44,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) struct FusedAllReduceOpHandle : public AllReduceOpHandle { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -75,7 +75,7 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { private: size_t num_of_all_reduce_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) gpuEvent_t start_event_{nullptr}; gpuEvent_t end_event_{nullptr}; #endif diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 198fb8b6eb07e6..6ba6df7011ade6 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -36,7 +36,7 @@ struct NCCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -46,7 +46,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc index 2ebaa31f53bd89..15648aa058f073 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -24,7 +24,7 @@ namespace paddle { namespace framework { namespace details { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) GradMergeAllReduceOpHandle::GradMergeAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -77,7 +77,7 @@ std::string GradMergeAllReduceOpHandle::Name() const { return "grad_merge_all_reduce"; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedGradMergeAllReduceOpHandle::FusedGradMergeAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, diff --git a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h index 5e8d061762cbc8..ce01f85eaba52a 100644 --- a/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/grad_merge_all_reduce_op_handle.h @@ -33,7 +33,7 @@ namespace platform { class 
NCCLCommunicator; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -44,7 +44,7 @@ namespace details { class GradMergeAllReduceOpHandle : public AllReduceOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) GradMergeAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -75,7 +75,7 @@ class GradMergeAllReduceOpHandle : public AllReduceOpHandle { class FusedGradMergeAllReduceOpHandle : public FusedAllReduceOpHandle { public: -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) FusedGradMergeAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 91cb342594a635..6c3f5356ac1f15 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -183,7 +183,7 @@ void CheckVarHasNanOrInf(const std::string& op_type, << ", place:" << tensor->place() << ", numel:" << tensor->numel(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h index ab7c4ecd884683..e4472e8d989dd2 100644 --- a/paddle/fluid/framework/details/nccl_op_handle.h +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -27,9 +27,6 @@ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MUSA -#include "paddle/fluid/platform/dynload/mccl.h" -#endif #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/flags.h" @@ -58,8 +55,6 @@ class NCCLOpHandleBase : public OpHandleBase { for (auto& ev : inter_events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -67,8 +62,6 @@ class NCCLOpHandleBase : public OpHandleBase { for (auto& ev : exter_events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -79,7 +72,7 @@ class NCCLOpHandleBase : public OpHandleBase { return nccl_ctxs_; } - mcclComm_t GetComm() const { + ncclComm_t GetComm() const { PADDLE_ENFORCE_EQ( places_.size(), 1, @@ -150,11 +143,6 @@ class NCCLOpHandleBase : public OpHandleBase { &inter_events_[dev_id], hipEventDisableTiming)); PADDLE_ENFORCE_GPU_SUCCESS(hipEventCreateWithFlags( &exter_events_[dev_id], hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventCreateWithFlags( - 
&inter_events_[dev_id], musaEventDisableTiming)); - PADDLE_ENFORCE_GPU_SUCCESS(musaEventCreateWithFlags( - &exter_events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags( &inter_events_[dev_id], cudaEventDisableTiming)); @@ -171,8 +159,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -188,7 +176,7 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dev_id:" << dev_id << ", dtype:" << datatype << ", place:" << place; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); } @@ -196,8 +184,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -215,8 +203,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { PADDLE_ENFORCE_GE( run_order_, 0, @@ -236,8 +224,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op UNUSED) { + ncclDataType_t datatype, + ncclRedOp_t op UNUSED) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); @@ -250,13 +238,11 @@ class NCCLOpHandleBase : public OpHandleBase { << ", dtype:" << datatype << ", place:" << place << ", stream:" << stream; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce( - sendbuff, recvbuff, count, datatype, mcclSum, 0, comm, stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); #ifdef PADDLE_WITH_HIP hipEventRecord(inter_events_.at(dev_id), stream); -#elif defined(PADDLE_WITH_MUSA) - musaEventRecord(inter_events_.at(dev_id), stream); #else cudaEventRecord(inter_events_.at(dev_id), stream); #endif @@ -270,8 +256,8 @@ class NCCLOpHandleBase : public OpHandleBase { const void* sendbuff, void* recvbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op) { + ncclDataType_t datatype, + ncclRedOp_t op) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); PADDLE_ENFORCE_NOT_NULL( nccl_ctxs_, @@ -290,21 +276,14 @@ class NCCLOpHandleBase : public OpHandleBase { #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); hipEventRecord(exter_events_.at(dev_id), stream); -#elif defined(PADDLE_WITH_MUSA) - musaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( - sendbuff, recvbuff, count, datatype, op, comm, stream)); - - musaEventRecord(exter_events_.at(dev_id), stream); #else cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, count, datatype, op, comm, stream)); 
cudaEventRecord(exter_events_.at(dev_id), stream); @@ -317,8 +296,8 @@ class NCCLOpHandleBase : public OpHandleBase { void InterBroadCast(platform::Place place, void* sendbuff, size_t count, - mcclDataType_t datatype, - mcclRedOp_t op UNUSED) { + ncclDataType_t datatype, + ncclRedOp_t op UNUSED) { auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); int dev_id = place.device; auto& nccl_ctx = nccl_ctxs->at(dev_id); @@ -331,12 +310,10 @@ class NCCLOpHandleBase : public OpHandleBase { << ", stream:" << stream; #ifdef PADDLE_WITH_HIP hipStreamWaitEvent(stream, exter_events_.at(dev_id), 0); -#elif defined(PADDLE_WITH_MUSA) - musaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #else cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); #endif - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( sendbuff, count, datatype, 0, comm, stream)); } diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 896b251571fc96..ee87141a9d5414 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,13 +31,11 @@ std::string OpHandleBase::DebugString() const { } OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { // NOLINT -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (auto &ev : events_) { if (ev.second) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(ev.second)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(ev.second)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(ev.second)); #endif @@ -47,16 +45,13 @@ OpHandleBase::~OpHandleBase() PADDLE_MAY_THROW { // NOLINT } void OpHandleBase::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (auto &p : dev_ctxes_) { int dev_id = p.first.device; // NOLINT platform::SetDeviceId(dev_id); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&events_[dev_id], hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&events_[dev_id], musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); @@ -141,7 +136,7 @@ void OpHandleBase::InitXPU() { } void OpHandleBase::Run(DeviceType use_device) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (events_.empty() && use_device == p::kCUDA && !dev_ctxes_.empty()) { InitCUDA(); } @@ -177,7 +172,7 @@ void OpHandleBase::Run(DeviceType use_device) { } void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_NOT_NULL( waited_ctx, platform::errors::InvalidArgument("Argument waited_ctx is NULL.")); @@ -193,8 +188,6 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { for (auto &ev : events_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream, ev.second, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream, ev.second, 0)); #else 
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream, ev.second, 0)); #endif @@ -228,15 +221,12 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = static_cast(dev_ctxes_.at(place))->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -258,7 +248,7 @@ void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { if (in_var_handle) { auto &place = in_var_handle->place(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto stream = @@ -283,16 +273,13 @@ void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = static_cast( dev_ctxes_.at(in_var_handle->place())) ->stream(); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); @@ -324,7 +311,7 @@ bool OpHandleBase::NeedWait(VarHandleBase *in_var) { void OpHandleBase::RunAndRecordEvent(const std::function &callback) { callback(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!events_.empty()) { // Use event for (auto &p : dev_ctxes_) { auto dev_id = p.first.device; @@ -333,9 +320,6 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_.at(dev_id), cuda_dev_ctx->stream())); @@ -347,7 +331,7 @@ void OpHandleBase::RunAndRecordEvent(const std::function &callback) { void OpHandleBase::RunAndRecordEvent(platform::Place p, const std::function &callback) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cpu_place(p) || events_.empty()) { callback(); } else { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 4bd385ff5099cb..9afe56e4babd45 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -161,7 +161,7 @@ class OpHandleBase { // See 
https://github.com/PaddlePaddle/Paddle/pull/32283 bool is_variant_scope_ = false; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unordered_map events_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index d7d0a3e2863638..fe43126ca8abe4 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -182,7 +182,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( @@ -210,12 +210,12 @@ void ReduceOpHandle::RunImpl() { size_t numel = static_cast(lod_tensor.numel()); all_reduce_calls.emplace_back( [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( buffer, recvbuffer, numel, - static_cast(type), - mcclSum, + static_cast(type), + ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index eb0e319cce3b50..2eb0ad29232119 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -39,7 +39,7 @@ namespace platform { struct NCCLContextMap; } // namespace platform } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #elif defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" @@ -79,7 +79,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -129,7 +129,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector GetLocalScopes() override { return local_scopes_; } -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA) && \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP) && \ defined PADDLE_WITH_DISTRIBUTE template void GatherSelectedRows( diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index f37ea73a477b66..8b486be9cc686a 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -76,7 +76,7 @@ struct ScaleLossGradFunctor { "Please recompile or reinstall Paddle with XPU support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) OutT cast_coeff = static_cast(coeff_); auto stream = static_cast(ctx_)->stream(); memory::Copy(place_, @@ -110,7 +110,7 @@ void 
ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { auto *tensor = var->GetMutable(); tensor->Resize(common::make_ddim({1})); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ScaleLossGradFunctor func( coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); if (record_event) { diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc index cb16915316ecfe..02a68fb697efbb 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -95,7 +95,7 @@ void ShareTensorBufferOpHandle::SetShareDimsAndDtype( } void ShareTensorBufferOpHandle::InitCUDA() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int dev_id = dev_ctxes_.begin()->first.device; events_[dev_id] = nullptr; #endif diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index 5c266946144fe0..ba678bbe2e26be 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -196,7 +196,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { auto comm = nccl_ctx.comm_; int encode_size = 2 * k * sizeof(int); - // dgc use mcclAllGather to get all the encoded data + // dgc use ncclAllGather to get all the encoded data // so the buffer need nranks. int buf_size = nranks_ * encode_size; void *gather_buff = gathers[i]->data(); @@ -207,10 +207,10 @@ void SparseAllReduceOpHandle::RunImplEncoded() { all_gather_calls.emplace_back([=] { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(in_tensor_buf, + platform::dynload::ncclAllGather(in_tensor_buf, gather_buff, 2 * k, - static_cast(dtype), + static_cast(dtype), comm, stream)); }); diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 9a130bea0d3a27..a6314220d5c264 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -129,7 +129,7 @@ struct VarHandle : public VarHandleBase { name_(std::move(name)), place_(std::move(place)) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasEvent() { return has_event_; } const gpuEvent_t& GetEvent() { @@ -154,7 +154,7 @@ struct VarHandle : public VarHandleBase { size_t scope_idx_; std::string name_; platform::Place place_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Only when this event is triggered, var is generated. 
gpuEvent_t event_; bool has_event_{false}; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index e448f80ae39388..d7714808ff08ac 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -53,7 +53,7 @@ class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -85,12 +85,12 @@ class PullDenseWorker { public: virtual ~PullDenseWorker() {} virtual void Initialize(const TrainerDesc& param); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void AddStream(const gpuStream_t stream) { copy_streams_.push_back(stream); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) void AddPlace(const paddle::platform::Place place) { places_.push_back(place); } @@ -155,7 +155,7 @@ class PullDenseWorker { float total_batch_num_ = 0; std::unordered_map scope_to_thread_id_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::vector copy_streams_; #endif std::vector places_; @@ -186,7 +186,7 @@ class DeviceWorker { virtual void ProduceTasks() {} virtual void GetXpuOpIndex() {} virtual void Schedule(int taskid UNUSED) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) virtual void SetStream(const gpuStream_t stream UNUSED) {} virtual void SetEvent(const gpuEvent_t event UNUSED) {} #endif @@ -588,7 +588,7 @@ class HeterCpuWorker : public HogwildWorker { }; #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUWorker : public HogwildWorker { @@ -604,7 +604,7 @@ class PSGPUWorker : public HogwildWorker { new (&program_) ProgramDesc(main_program); } void ProduceTasks() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } #endif @@ -672,7 +672,7 @@ class PSGPUWorker : public HogwildWorker { std::unordered_map> feasign_set_; paddle::framework::Channel> pull_queue_; paddle::framework::Channel> push_queue_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuEvent_t event_; gpuStream_t copy_stream_; #endif @@ -718,7 +718,7 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} @@ -845,7 +845,7 @@ class HeterSectionWorker : public DeviceWorker { Scope* GetThreadScope() override { return minibatch_scope_; } // multi-stream - // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) + // #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // void SetStream(const gpuStream_t stream) override {} // void SetEvent(const gpuEvent_t event) override {} // #endif diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index c4ef22ebfe82cb..5c920fa3e318f9 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -77,13 +77,13 @@ REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker); REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL|| \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 4c6e19fd964bb1..1e1a02f944f65b 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -96,7 +96,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; device.device_type = kDLGPU; device.device_id = place.device; // NOLINT @@ -108,7 +108,7 @@ struct DLDeviceVisitor { } inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLDevice device; device.device_type = kDLCPUPinned; device.device_id = 0; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 659bdcaaf95164..5dee8b04e78b7b 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -50,12 +50,6 @@ if(WITH_HETERPS) SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) add_subdirectory(heter_ps) - elseif(WITH_MCCL) - musa_library( - ps_gpu_wrapper - SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc - DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) - add_subdirectory(heter_ps) endif() else() cc_library( @@ -64,7 +58,7 @@ else() DEPS gloo_wrapper) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) cc_library( nccl_wrapper SRCS nccl_wrapper.cc @@ -83,12 +77,6 @@ if(WITH_BOX_PS) SRCS box_wrapper.cc box_wrapper.cu DEPS framework_proto lod_tensor box_ps) endif() - if(WITH_MUSA) - musa_library( - box_wrapper - SRCS box_wrapper.cc box_wrapper.cu - DEPS framework_proto lod_tensor box_ps) - endif() else() cc_library( box_wrapper diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index 0d1c4aba87dc57..5f46906cf8e823 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -161,11 +161,6 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, values.data(), values.size() * sizeof(float*), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(gpu_values, - values.data(), - values.size() 
* sizeof(float*), - musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, values.data(), @@ -255,10 +250,6 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, slot_num, total_len); hipStreamSynchronize(stream); -#elif defined(PADDLE_WITH_MUSA) - CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>( - origin_keys, total_keys, gpu_len, slot_num, total_len); - musaStreamSynchronize(stream); #else CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>( origin_keys, total_keys, gpu_len, slot_num, total_len); @@ -304,19 +295,6 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, slot_vector_.data(), slot_lengths_lod.size() * sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(gpu_values, - grad_values.data(), - grad_values.size() * sizeof(float*), - musaMemcpyHostToDevice); - musaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - musaMemcpyHostToDevice); - musaMemcpy(d_slot_vector, - slot_vector_.data(), - slot_lengths_lod.size() * sizeof(int), - musaMemcpyHostToDevice); #else cudaMemcpy(gpu_values, grad_values.data(), diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index b3432277805a7e..9853c328cd14e9 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -595,9 +595,6 @@ class BoxWrapper { data->resize(len); #ifdef PADDLE_WITH_HIP hipMemcpy(data->data(), gpu_data, sizeof(T) * len, hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - data->data(), gpu_data, sizeof(T) * len, musaMemcpyDeviceToHost); #else cudaMemcpy( data->data(), gpu_data, sizeof(T) * len, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h index 9eb4360e7dd08d..d72e418aadd3ef 100644 --- a/paddle/fluid/framework/fleet/box_wrapper_impl.h +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -44,7 +44,7 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; int device_id = place.GetDeviceId(); phi::DenseTensor& total_keys_tensor = keys_tensor[device_id]; @@ -70,15 +70,6 @@ void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, slot_lengths_lod.data(), slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy(gpu_keys, - keys.data(), - keys.size() * sizeof(uint64_t*), - musaMemcpyHostToDevice); - musaMemcpy(gpu_len, - slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), - musaMemcpyHostToDevice); #else cudaMemcpy(gpu_keys, keys.data(), @@ -162,7 +153,7 @@ void BoxWrapper::PushSparseGradCase( PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in PaddleBox now.")); } else if (platform::is_gpu_place(place)) { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) int device_id = place.GetDeviceId(); phi::DenseTensor& cached_total_keys_tensor = 
keys_tensor[device_id]; uint64_t* total_keys = diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 7ac9e4f7302a66..05433c1014656f 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -784,7 +784,7 @@ void FleetWrapper::PushDenseVarsSync( const uint64_t table_id, const std::vector& var_names) {} -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ (defined PADDLE_WITH_PSLIB) void FleetWrapper::PushDenseVarsAsync( const Scope& scope, @@ -816,9 +816,6 @@ void FleetWrapper::PushDenseVarsAsync( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, stream)); hipEventSynchronize(event); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, stream)); - musaEventSynchronize(event); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, stream)); cudaEventSynchronize(event); diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 1284b379c9f20b..fb5cf917292566 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -175,7 +175,7 @@ class FleetWrapper { // Push dense variables to server in async mode // Param: scope, table_id, var_names, scale_datanorm, batch_size // Param: push_sparse_status -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 1dbd675073dd7a..0af67107f0cbc6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -96,18 +96,3 @@ if(WITH_ROCM) SRCS heter_ps.cu DEPS heter_comm) endif() -if(WITH_MUSA) - musa_library( - heter_comm - SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h - hashtable.h - DEPS cub device_context) - musa_test( - test_heter_comm - SRCS feature_value.h - DEPS heter_comm) - musa_library( - heter_ps - SRCS heter_ps.cu - DEPS heter_comm) -endif() \ No newline at end of file diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu index b5d788840ee547..3bf395071df274 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu @@ -846,7 +846,7 @@ void GraphGpuWrapper::init_service() { inter_comms_.resize(dev_size); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { - platform::dynload::mcclGetUniqueId(&inter_ncclids_[i]); + platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); } } @@ -860,13 +860,13 @@ void GraphGpuWrapper::init_service() { opts.setRoot(0); gloo::broadcast(opts); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (int i = 0; i < dev_size; ++i) { platform::CUDADeviceGuard guard(device_id_mapping[i]); platform::dynload::ncclCommInitRank( &inter_comms_[i], gloo->Size(), inter_ncclids_[i], gloo->Rank()); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); rank_id_ = 
gloo->Rank(); node_size_ = gloo->Size(); diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h index 4045c615a27cb3..315a9860ed67a2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/fleet/heter_ps/gpu_graph_node.h" #ifdef PADDLE_WITH_HETERPS -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include #include @@ -302,9 +302,9 @@ class GraphGpuWrapper { int node_size_ = 1; int multi_node_ = 0; #ifdef PADDLE_WITH_CUDA - std::vector<mcclComm_t> inner_comms_; - std::vector<mcclComm_t> inter_comms_; - std::vector<mcclUniqueId> inter_ncclids_; + std::vector<ncclComm_t> inner_comms_; + std::vector<ncclComm_t> inter_comms_; + std::vector<ncclUniqueId> inter_ncclids_; #endif }; // class GraphGpuWrapper #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index b869ad1c235cb6..18e3966b220c0c 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -166,8 +166,8 @@ class HeterComm { size_t len, Sgd& sgd); // NOLINT - void set_nccl_comm_and_size(const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + void set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) { nccl_inner_comms_ = inner_comms; @@ -791,8 +791,8 @@ class HeterComm { #if defined(PADDLE_WITH_CUDA) GpuRDMAChecker* rdma_checker_ = nullptr; - std::vector<mcclComm_t> nccl_inner_comms_; - std::vector<mcclComm_t> nccl_inter_comms_; + std::vector<ncclComm_t> nccl_inner_comms_; + std::vector<ncclComm_t> nccl_inter_comms_; int multi_mf_dim_{8}; int max_mf_dim_ = 8; std::vector> allocators_; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 3df6e6e89861ff..36fe556bcf3fbd 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -2870,7 +2870,7 @@ size_t HeterComm::send_data_by_all2all( auto &loc = storage_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); size_t total_fea_num = 0; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (int i = 0; i < nccl_node_size; i++) { if (i == nccl_rank_id) { continue; } @@ -2881,7 +2881,7 @@ size_t HeterComm::send_data_by_all2all( PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclSend(&d_send_buff[send_offset], send_size * value_bytes, - mcclInt8, + ncclInt8, i, comm, nccl_stream)); @@ -2893,14 +2893,14 @@ size_t HeterComm::send_data_by_all2all( PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( reinterpret_cast(&d_rev_buff[recv_offset]), recv_size * value_bytes, - mcclInt8, + ncclInt8, i, comm, nccl_stream)); total_fea_num += recv_size; } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); return total_fea_num; @@ -2959,11 +2959,11 @@ size_t HeterComm:: cache.node_barrier_.Resume(); auto &comm = nccl_inter_comms_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( &res.d_node_size_ptr[rank_offset],
reinterpret_cast(res.d_node_size_ptr), node_size_, - mcclInt, + ncclInt, comm, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); @@ -3780,11 +3780,11 @@ size_t HeterComm:: my_cache.node_barrier_.Resume(); auto &comm = nccl_inter_comms_[gpu_id]; auto nccl_stream = resource_->comm_stream(gpu_id, 0); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( &res.d_node_size_ptr[rank_id_ * node_size_], reinterpret_cast(res.d_node_size_ptr), node_size_, - mcclInt, + ncclInt, comm, nccl_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(nccl_stream)); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 017e3726357b9a..3fe05753e09a31 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -134,8 +134,8 @@ void HeterPs::push_sparse(int num, template class GPUOptimizer> void HeterPs::set_nccl_comm_and_size( - const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) { comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size, rank_id); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index d1c1d0c8b611bb..c472c2ed75a9d6 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -49,8 +49,8 @@ class HeterPs : public HeterPsBase { size_t chunk_size, int stream_num) override; #if defined(PADDLE_WITH_CUDA) - void set_nccl_comm_and_size(const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + void set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index b729cdfcbb0f96..8624425d8bfbd2 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -46,8 +46,8 @@ class HeterPsBase { virtual int get_index_by_devid(int devid) = 0; #if defined(PADDLE_WITH_CUDA) virtual void set_nccl_comm_and_size( - const std::vector<mcclComm_t>& inner_comms, - const std::vector<mcclComm_t>& inter_comms, + const std::vector<ncclComm_t>& inner_comms, + const std::vector<ncclComm_t>& inter_comms, int comm_size, int rank_id) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 97b704b4f3d219..a8ce9be92bdf68 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -121,7 +121,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, tensor->numel() * SizeOfType(framework::TransToProtoVarType(tensor->dtype()))); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(platform::CPUPlace(), data_ptr, tensor->place(), @@ -141,7 +141,7 @@ void HeterWrapper::SerializeToReq(const std::string& varname, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void
HeterWrapper::DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, @@ -169,7 +169,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, void* tensor_data = tensor->mutable_data( place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(place, tensor_data, platform::CPUPlace(), diff --git a/paddle/fluid/framework/fleet/heter_wrapper.h b/paddle/fluid/framework/fleet/heter_wrapper.h index 70cbce2acc24d7..77838fbec6d00e 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.h +++ b/paddle/fluid/framework/fleet/heter_wrapper.h @@ -92,7 +92,7 @@ class HeterWrapper { framework::proto::VarType::Type ToVarType(VariableMessage::Type type); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var, platform::Place place, diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc index 8be530c3170ba3..640f7dd08dc8d1 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.cc +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -21,9 +21,9 @@ std::shared_ptr NCCLWrapper::s_instance_ = NULL; bool NCCLWrapper::is_initialized_ = false; void NCCLWrapper::InitNCCL() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclCommInitRank(&(nccl_info_.comm_), + platform::dynload::ncclCommInitRank(&(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, nccl_info_.my_global_rank_)); @@ -32,16 +32,16 @@ void NCCLWrapper::InitNCCL() { } void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_info_.nccl_id_ = nccl_info.nccl_id_; #endif return; } NCCLInfo NCCLWrapper::GetNCCLId() { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclGetUniqueId(&(nccl_info_.nccl_id_))); + platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); #endif return nccl_info_; } @@ -49,15 +49,13 @@ NCCLInfo NCCLWrapper::GetNCCLId() { void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, const int ranks) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_info_.local_rank_ = local_rank; nccl_info_.my_global_rank_ = global_rank; nccl_info_.global_ranks_ = ranks; platform::SetDeviceId(local_rank); #ifdef PADDLE_WITH_RCCL PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&(nccl_info_.stream_))); -#elif defined(PADDLE_WITH_MCCL) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&(nccl_info_.stream_))); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&(nccl_info_.stream_))); #endif @@ -68,22 +66,20 @@ void NCCLWrapper::SetRankInfo(const int local_rank, void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, const std::vector& var_names) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if 
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) for (auto& name : var_names) { auto var = scope.FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); int32_t total_size = tensor->numel(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(tensor->data()), total_size, - mcclFloat, + ncclFloat, root_rank, nccl_info_.comm_, nccl_info_.stream_)); #ifdef PADDLE_WITH_RCCL hipStreamSynchronize(nccl_info_.stream_); -#elif defined(PADDLE_WITH_MCCL) - musaStreamSynchronize(nccl_info_.stream_); #else cudaStreamSynchronize(nccl_info_.stream_); #endif diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h index 46cdae20395e91..7e9cc0c56a6b46 100644 --- a/paddle/fluid/framework/fleet/nccl_wrapper.h +++ b/paddle/fluid/framework/fleet/nccl_wrapper.h @@ -31,10 +31,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif - #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN namespace paddle { @@ -55,9 +51,9 @@ class NCCLInfo { int local_rank_; int global_ranks_; int my_global_rank_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclUniqueId nccl_id_; - mcclComm_t comm_; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclUniqueId nccl_id_; + ncclComm_t comm_; gpuStream_t stream_; #endif }; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index 85fe092e963db2..edfa4048b55287 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -314,7 +314,7 @@ class PSGPUWrapper { inter_comms_.resize(dev_size); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { - platform::dynload::mcclGetUniqueId(&inter_ncclids_[i]); + platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); } } @@ -328,13 +328,13 @@ class PSGPUWrapper { opts.setRoot(0); gloo::broadcast(opts); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (int i = 0; i < dev_size; ++i) { platform::CUDADeviceGuard guard(dev_ids[i]); platform::dynload::ncclCommInitRank( &inter_comms_[i], gloo->Size(), inter_ncclids_[i], gloo->Rank()); } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); rank_id_ = gloo->Rank(); node_size_ = gloo->Size(); @@ -979,9 +979,9 @@ class PSGPUWrapper { uint64_t table_id_; int gpu_graph_mode_ = 0; #ifdef PADDLE_WITH_CUDA - std::vector inner_comms_; - std::vector inter_comms_; - std::vector inter_ncclids_; + std::vector inner_comms_; + std::vector inter_comms_; + std::vector inter_ncclids_; #endif std::vector heter_devices_; std::unordered_set gpu_ps_config_keys_; diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 5f9db8c20d51ff..d0620381ae8e91 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/framework/garbage_collector.h" @@ -64,7 +64,7 @@ void IPUGarbageCollector::ClearCallback(const std::function &callback) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( const platform::CUDAPlace &place, size_t max_memory_size) : GarbageCollector(place, max_memory_size) {} @@ -93,8 +93,6 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, platform::CUDADeviceGuard guard(place.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreate(&stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreate(&stream_)); callback_manager_ = @@ -203,7 +201,7 @@ std::unique_ptr CreateGarbageCollector( const platform::Place &place, const size_t max_memory_size) { std::unique_ptr gc = nullptr; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place, max_memory_size); diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index f9d94600a513d9..5376739624d6f3 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -85,7 +85,7 @@ class IPUGarbageCollector : public GarbageCollector { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index 83dbe31d86a5a8..b98094ab74101c 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/phi/core/flags.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif @@ -1202,20 +1202,20 @@ bool HogwildWorker::CheckBatchNum(int flag) { // comm_ctx->AllReduce only support allreduce on the whole tensor, // single element is not supported now. 
PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllReduce(&stat_ptr[flag], + platform::dynload::ncclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - mcclProd, + ncclProd, comm_ctx->GetNcclComm(), stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&stat_ptr[flag], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - mcclProd, + ncclProd, comm->comm(), stream)); } @@ -1246,11 +1246,11 @@ bool HogwildWorker::GetPassEnd(int flag) { // auto stream = static_cast(dev_ctx_)->stream(); // PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); auto stream = comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(&stat_ptr[flag], + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(&stat_ptr[flag], &stat_ptr[2], 1, ncclFloat32, - mcclProd, + ncclProd, comm->comm(), stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&ret, // output @@ -1267,7 +1267,7 @@ bool HogwildWorker::GetPassEnd(int flag) { void HogwildWorker::TrainFilesWithProfiler() { platform::SetNumThreads(1); #if defined(PADDLE_WITH_HETERPS) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); @@ -1473,7 +1473,7 @@ void HogwildWorker::TrainFiles() { platform::Timer timeline; timeline.Start(); #if defined(PADDLE_WITH_HETERPS) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_HETERPS) && defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d0c11c3098ddb2..46183fd93e97fd 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -3,7 +3,7 @@ add_subdirectory(memory_optimize_pass) add_subdirectory(multi_devices_graph_pass) if(NOT APPLE AND NOT WIN32 - AND (WITH_GPU OR WITH_ROCM OR WITH_MUSA)) + AND (WITH_GPU OR WITH_ROCM)) add_subdirectory(fusion_group) endif() @@ -169,7 +169,7 @@ if(WITH_TENSORRT) pass_library(trt_remove_amp_strategy_op_pass inference) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) pass_library(cudnn_placement_pass base DEPS placement_pass_base) pass_library(embedding_eltwise_layernorm_fuse_pass inference) endif() @@ -493,7 +493,7 @@ cc_test( SRCS relu6_fuse_pass_test.cc DEPS relu6_fuse_pass) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) cc_test( test_embedding_eltwise_layernorm_fuse_pass SRCS embedding_eltwise_layernorm_fuse_pass_tester.cc @@ -543,7 +543,7 @@ if(WITH_MKLDNN) device_context phi common) - if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) + if(WITH_GPU OR WITH_ROCM) set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv) endif() cc_test( diff --git a/paddle/fluid/framework/ir/cost_model.cc b/paddle/fluid/framework/ir/cost_model.cc index a28930961efa0e..a54138060283bc 100644 --- a/paddle/fluid/framework/ir/cost_model.cc +++ b/paddle/fluid/framework/ir/cost_model.cc @@ -128,7 +128,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[op_push_index].CpuElapsedMs( main_thread_events[op_pop_index]); double gpu_time_ms = 0; -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu_time_ms = main_thread_events[op_push_index].CudaElapsedMs( main_thread_events[op_pop_index]); #endif @@ -152,7 +152,7 @@ bool CostData::SetCostData(const ProgramDesc& program, double cpu_time_ms = main_thread_events[start_profiler_idx].CpuElapsedMs( main_thread_events[stop_profiler_idx]); double gpu_time_ms = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu_time_ms = main_thread_events[start_profiler_idx].CudaElapsedMs( main_thread_events[stop_profiler_idx]); #endif diff --git a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc index e0a9502c685d25..048b33a649f94d 100644 --- a/paddle/fluid/framework/ir/fuse_bn_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_act_pass.cc @@ -34,8 +34,8 @@ namespace framework { namespace ir { void FuseBatchNormActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc index 36fa8a3331e7e1..2a24c5476a5010 100644 --- a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -25,8 +25,8 @@ namespace framework { namespace ir { void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDNN_VERSION_MIN(7, 4, 1) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 4, 1) // forward std::unordered_set act_types = {"relu"}; graph = FuseBatchNormAddAct(graph, act_types); diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index 390dd25b9cf5dd..570b081aae95ed 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library( code_generator SRCS operation.cc code_generator.cc code_generator_helper.cc DEPS graph subgraph_detector) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) cc_test( test_code_generator SRCS code_generator_tester.cc diff --git a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc index 92c1c1c6f02077..9749fb2bfa81c5 100644 --- a/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc +++ b/paddle/fluid/framework/ir/fusion_group/code_generator_tester.cc @@ -27,7 +27,7 @@ namespace phi { class DenseTensor; } // namespace phi -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h index 
232e9bbf43607f..195b29a9794a9a 100644 --- a/paddle/fluid/framework/ir/fusion_group/cuda_resources.h +++ b/paddle/fluid/framework/ir/fusion_group/cuda_resources.h @@ -34,7 +34,7 @@ __device__ inline double Log(double x) { return log(x); } __device__ inline double Sqrt(double x) { return sqrt(x); } )"; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP static constexpr char predefined_cuda_functions_fp16[] = R"( __device__ inline __half Exp(const __half x) { return hexp(x); } __device__ inline __half Log(const __half x) { return hlog(x); } diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 17910d7dfae80b..30a001777bd587 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -25,7 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_utils.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -513,7 +513,7 @@ static OpDesc *ReplaceScaleLossGradOp(const Node &node, OpDesc *desc) { void ReplaceAllReduceOp(const Node &node, proto::BlockDesc *block, std::vector *ops) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) bool is_fused = (node.Name() == "fused_all_reduce"); details::OpHandleBase &op_handle = @@ -688,7 +688,7 @@ static void GetGraphOpDesc(const std::vector &nodes, ops->emplace_back(); auto &desc = ops->back(); ReplaceScaleLossGradOp(*n, &desc); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) } else if ((n->Name() == "allreduce" || n->Name() == "fused_all_reduce") && dynamic_cast( &(n->Wrapper())) != nullptr) { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index c2a8c1bc73e8ea..9c60a665de0021 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -204,7 +204,7 @@ TEST(test_reference_count_pass, test_no_need_buffer_var_shrink) { {}); std::vector use_cuda_list{false}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) use_cuda_list.push_back(true); #endif for (auto use_cuda : use_cuda_list) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc index 4579e172ef665e..0dcf316c33c696 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc @@ -30,7 +30,7 @@ class AllReduceDepsPass : public ir::Pass { std::vector all_reduce_op_handles = GetSortedAllReduceOps(*graph); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) auto use_hierarchical_allreduce = Get(details::kUseHierarchicalAllReduce); for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index a24fd784bb4088..dc18979260f928 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -37,7 +37,7 @@ class FuseAllReduceOpPass : public ir::Pass { auto &places = Get>(details::kPlaces); auto &local_scopes = Get>(details::kLocalScopes); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *multi_nccl_ctxs = &Get(details::kNCCLCtxs); #elif defined(PADDLE_WITH_XPU_BKCL) @@ -95,7 +95,7 @@ class FuseAllReduceOpPass : public ir::Pass { for (auto &p_g : group_p_g) { group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second)); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) InsertFusedAllReduce(places, local_scopes, group_size, @@ -177,7 +177,7 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, const size_t num_of_all_reduce, const std::vector &all_reduce_ops, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLCommunicator *multi_bkcl_ctxs, @@ -244,7 +244,7 @@ class FuseAllReduceOpPass : public ir::Pass { result->RemoveNode(op_handle.Node()); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, @@ -285,7 +285,7 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, bool is_grad_merge, const std::string &grad_merge_cond_name, -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, #elif defined(PADDLE_WITH_XPU_BKCL) const platform::BKCLCommunicator *multi_bkcl_ctxs, @@ -293,7 +293,7 @@ class FuseAllReduceOpPass : public ir::Pass { ir::Graph *result) const { details::FusedAllReduceOpHandle *op_handle = nullptr; if (is_grad_merge) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) op_handle = new details::FusedGradMergeAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -321,7 +321,7 @@ class FuseAllReduceOpPass : public ir::Pass { grad_merge_cond_name); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -355,7 +355,7 @@ class FuseAllReduceOpPass : public ir::Pass { op_handle->AddOutput(out); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) if (!multi_nccl_ctxs) { SetCommunicationContext(places, op_handle); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 9e7b22b8930cca..295ef57cfdfead 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -170,7 +170,7 @@ void MultiDevSSAGraphBuilderBase::Init() const { places_ = Get>(details::kPlaces); local_scopes_ = Get>(details::kLocalScopes); strategy_ = Get(kStrategy); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) multi_nccl_ctxs_ = &Get(details::kNCCLCtxs); nccl_ctxs_ = nullptr; if (multi_nccl_ctxs_) { @@ -338,7 +338,7 @@ std::vector MultiDevSSAGraphBuilderBase::SortOperations( bool MultiDevSSAGraphBuilderBase::UseGPU() const { bool use_gpu = false; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) use_gpu = nccl_ctxs_ != nullptr; #endif return use_gpu; @@ -389,7 +389,7 @@ void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, void MultiDevSSAGraphBuilderBase::SetCommunicationContext( details::OpHandleBase *op_handle, const platform::Place &p) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -408,7 +408,7 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext( void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *op_handle = new details::BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, @@ -453,7 +453,7 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *op_handle = new details::FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, @@ -534,7 +534,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, -> details::OpHandleBase * { if (is_encoded) { #if defined(PADDLE_WITH_DGC) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) result->Get(kGraphOps).emplace_back( new details::SparseAllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -553,7 +553,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, grad_merge_cond_name = PADDLE_GET_CONST( std::string, node->Op()->GetAttr(GRAD_MERGE_COND_NAME)); VLOG(10) << "og=" << og << " use grad_merge_allreduce"; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) result->Get(kGraphOps).emplace_back( new details::GradMergeAllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -578,7 +578,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, grad_merge_cond_name)); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -718,7 +718,7 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps( details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( ir::Graph *result, const std::string &og, size_t dst_dev_id) const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index 397922ad4bc88a..9e8fb5202a2d57 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -39,7 +39,7 @@ class Graph; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLCommunicator; class NCCLContextMap; #elif defined(PADDLE_WITH_XPU_BKCL) @@ -126,7 +126,7 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { void CreateIsolatedVarNode(ir::Graph *result, ir::Node *var_node) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) mutable platform::NCCLContextMap *nccl_ctxs_{nullptr}; mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr}; #elif defined(PADDLE_WITH_XPU_BKCL) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc index 976cd32e8ae515..debc3be7a32e00 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -34,7 +34,7 @@ #include "paddle/fluid/framework/new_executor/pir_adaptor/pir_adaptor_util.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/pir/core/block_argument.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -105,7 +105,7 @@ platform::DeviceContext* ParseDeviceContext( return dev_ctx; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum // with use_cal_stream==false by returning a device context getting from the // global NCCLCommContext instance. 
Because when use_calc_stream==false, in @@ -338,7 +338,7 @@ bool GetCondData(const phi::DenseTensor& cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) paddle::framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc index a7434ad9d41819..8383b1fdd1790c 100644 --- a/paddle/fluid/framework/new_executor/interpreter/execution_config.cc +++ b/paddle/fluid/framework/new_executor/interpreter/execution_config.cc @@ -53,7 +53,7 @@ inline std::tuple GetThreadPoolConfig(const phi::Place& place, processor_count = static_cast(std::thread::hardware_concurrency()); if (processor_count) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) device_count = phi::backends::gpu::GetGPUDeviceCount(); #endif } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 491370d4198fbf..46b9247728d63e 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -749,7 +749,7 @@ void BuildOpFuncList(const platform::Place& place, *op_with_kernel, *runtime_scope, *dev_ctx, runtime_context); auto expected_kernel_key = framework::TransPhiKernelKeyToOpKernelType( op_with_kernel->GetExpectedKernelType(exec_ctx)); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (op_with_kernel->CanCUDNNBeUsed(exec_ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index bc273000e626f5..5b60205fbc529f 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -229,7 +229,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( return dev_ctx; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum // with use_cal_stream==false by returning a device context getting from the // global NCCLCommContext instance. 
Because when use_calc_stream==false, in diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index f6a5ed407c3f34..ff5832ba8335e6 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -48,7 +48,7 @@ PD_DECLARE_bool(benchmark); PHI_DECLARE_uint64(executor_log_deps_every_microseconds); PHI_DECLARE_bool(new_executor_use_cuda_graph); PHI_DECLARE_bool(enable_pir_in_executor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -121,7 +121,7 @@ class InterpreterBaseImpl { inline void SetDeviceId(const platform::Place& place) { // TODO(zhiqiu): reduce the cost if (platform::is_gpu_place(place)) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::Unavailable( "Cannot run operator on place %s, please recompile paddle or " "reinstall Paddle with CUDA support.", diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index ee7587140b9234..a336e2c377dfd1 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -314,7 +314,7 @@ void Instruction::AddInplace(Variable* in, Variable* out) { void Instruction::ClearInplace() { vec_inplace_in_to_out_.clear(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void Instruction::UpdataRecordStreamForGcInfo() { if (!IsInterpretercoreFastGCEnabled() || KernelType() != OpFuncType::kGpuAsync) { @@ -328,7 +328,7 @@ void Instruction::UpdataRecordStreamForGcInfo() { stream_ = reinterpret_cast(DeviceContext()).stream(); // TODO(lizhiyu): Only analyse the 'send_v2' for GPT pp strategy right now. // To support all the operators for communicating in the future. 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto operator_base_ptr = OpBase(); if ((operator_base_ptr->Type() == "send_v2") && (operator_base_ptr->Attr("use_calc_stream") == false)) { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 6e96c0e5c109fa..66773746deb274 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -26,7 +26,7 @@ #include "paddle/fluid/platform/event.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/utils/rw_lock.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -306,7 +306,7 @@ class Instruction { const OpFuncNode* OpFunc() const { return &op_func_node_; } // record stream for gc -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool need_record_stream_for_gc_ = false; gpuStream_t stream_{nullptr}; void UpdataRecordStreamForGcInfo(); diff --git a/paddle/fluid/framework/new_executor/pir_interpreter.cc b/paddle/fluid/framework/new_executor/pir_interpreter.cc index fe64b51464214c..66de40585130b5 100644 --- a/paddle/fluid/framework/new_executor/pir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/pir_interpreter.cc @@ -64,7 +64,7 @@ #include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/dialect/control_flow/ir/cf_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -857,7 +857,7 @@ void PirInterpreter::RecordMemcpyD2H(InstructionBase* instr_node) { } void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::Unimplemented( "RecordStreamForGC is only implemented when compiled with GPU.")); #else @@ -876,7 +876,7 @@ void PirInterpreter::RecordStreamForGC(InstructionBase* instr) { reinterpret_cast(instr->DeviceContext()).stream(); // TODO(lizhiyu): Only analyse the 'send_v2' for GPT pp strategy right now. // To support all the operators for communicating in the future. 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (instr->Name() == "pd_op.send_v2") { ::pir::Operation* op = instr->Operation(); if (op->HasAttribute("use_calc_stream") && @@ -998,7 +998,7 @@ void PirInterpreter::CheckGC(InstructionBase* instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RecordStreamForGC(instr); #endif @@ -1619,7 +1619,7 @@ void PirInterpreter::RunInstructionBase(InstructionBase* instr_node) { if (FLAGS_benchmark) { instr_node->DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << instr_node->Name() // NOLINT << "): context wait and get last error"; diff --git a/paddle/fluid/framework/new_executor/profiler.h b/paddle/fluid/framework/new_executor/profiler.h index f2fa9fd50eedbb..95eee77d362883 100644 --- a/paddle/fluid/framework/new_executor/profiler.h +++ b/paddle/fluid/framework/new_executor/profiler.h @@ -42,7 +42,7 @@ class ProfilerGuard { private: void TotalCUDAAllocatedMemorySize(const platform::Place& place) { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto cuda_place = place; cost_info_->device_memory_bytes = platform::RecordedGpuMallocSize(cuda_place.device); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index f0aefb94e6b691..d1ce9f55e46901 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -32,7 +32,7 @@ #endif #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/phi/backends/device_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place, PrepareForCUDAGraphCapture(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) calculate_stream_timer_ = std::make_unique(place); #endif } @@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { std::tuple ProgramInterpreter::InterpreterRunTime() { double start_time = 0, end_time = 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) start_time = calculate_stream_timer_->StartTime(); end_time = calculate_stream_timer_->EndTime(); #endif @@ -701,7 +701,7 @@ void ProgramInterpreter::Convert( #endif vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) 
vec_instruction_.back().UpdataRecordStreamForGcInfo(); #endif } @@ -973,7 +973,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { 1, platform::EventRole::kInnerOp); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (is_in_op_profiling_mode_) { platform::GpuDeviceSync(); } @@ -1009,7 +1009,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { OperatorDistAttr* op_dist_attr = block_.Op(op->Id())->MutableDistAttr(); platform::Timer op_timer; op_timer.Start(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::GpuDeviceSync(); #endif op_timer.Pause(); @@ -1040,7 +1040,7 @@ void ProgramInterpreter::RunOperator(const Instruction& instr_node) { /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() // NOLINT << "): context wait and get last error"; @@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { try { instr_node.WaitEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" && !interpreter::IsCommunicationOp(instr_node)) { @@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) { } instr_node.RecordEvent(place_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (enable_job_schedule_profiler_) { if (instr_node.Id() == last_calculate_instr_id_ && calculate_stream_timer_->IsStarted()) { @@ -1320,7 +1320,7 @@ void ProgramInterpreter::RunInstructionAsync(size_t instr_id) { } void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::Unimplemented( "RecordStreamForGC is only implemented when compiled with GPU.")); #else @@ -1428,7 +1428,7 @@ void ProgramInterpreter::RecordStreamForGC(const Instruction& instr) { void ProgramInterpreter::CheckGC(const Instruction& instr) { platform::RecordEvent record( "CheckGC", platform::TracerEventType::UserDefined, 10); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (instr.need_record_stream_for_gc_) { RecordStreamForGC(instr); } diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 701da4f9473599..b19e3a06a42588 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -16,7 +16,7 @@ #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include 
"paddle/phi/kernels/autotune/gpu_timer.h" #endif @@ -234,7 +234,7 @@ class ProgramInterpreter : public InterpreterBaseImpl { std::vector output_hookfuncs_; std::vector input_hookfuncs_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::unique_ptr calculate_stream_timer_; #endif size_t last_calculate_instr_id_; diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index f4a5f6d410eae0..84ee045918fcd7 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -359,7 +359,7 @@ struct OpKernelRegistrarFunctorExCanCUDNNBeUsed(exe_ctx, kernel_type.data_type_)) { auto tmp_kernel_type = kernel_type; tmp_kernel_type.library_type_ = framework::LibraryType::kCUDNN; @@ -1567,12 +1567,12 @@ bool OperatorWithKernel::CanCUDNNBeUsed(const framework::ExecutionContext& ctx, bool use_cudnn = ctx.HasAttr("use_cudnn") && ctx.Attr("use_cudnn") && paddle::platform::is_gpu_place(ctx.GetPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (use_cudnn) { auto& dev_ctx = ctx.device_context(); use_cudnn &= (dev_ctx.cudnn_handle() != nullptr); } -#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP || defined(PADDLE_WITH_MUSA) +#endif // PADDLE_WITH_CUDA || PADDLE_WITH_HIP #if defined(PADDLE_WITH_CUDA) if (use_cudnn && data_type == phi::DataType::BFLOAT16) { @@ -1808,7 +1808,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (this->CanCUDNNBeUsed(exe_ctx, kernel_type_->data_type_)) { kernel_type_->library_type_ = framework::LibraryType::kCUDNN; } @@ -2071,7 +2071,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, /*For profiling/benchmark only*/ if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM) || defined(PADLDE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADLDE_WITH_ROCM) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); #endif VLOG(4) << "Operator(" << Type() << "): context wait and get last error"; @@ -2134,7 +2134,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (this->CanCUDNNBeUsed(ctx, expected_kernel_key.data_type_)) { expected_kernel_key.library_type_ = framework::LibraryType::kCUDNN; } @@ -2157,7 +2157,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( // CPUKernel will be executed and a warning will be given at the same // time. 
expected_kernel_key.place_ = platform::CPUPlace(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (SupportGPU()) { auto& dev_ctx = ctx.device_context(); expected_kernel_key.place_ = dev_ctx.GetPlace(); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index f8943d53f15909..d51c0ce0f415d0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -584,7 +584,7 @@ class ExecutionContext : public phi::KernelContext { return device_context_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const inline phi::GPUContext& cuda_device_context() const { PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index cef7e14a2a1b89..e6c11df275b569 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -41,14 +41,14 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/platform/flags.h" PHI_DECLARE_double(eager_delete_tensor_gb); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DECLARE_bool(sync_nccl_allreduce); #endif @@ -69,7 +69,7 @@ static std::once_flag gProfileOnce; static bool gProfileStarted = false; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::once_flag p2p_init_flag; #endif @@ -148,7 +148,7 @@ class ParallelExecutorPrivate { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ << ", num_trainers:" << bst.num_trainers_ @@ -162,7 +162,7 @@ class ParallelExecutorPrivate { << bst.hierarchical_allreduce_exter_nranks_; } - std::vector flat_nccl_ids; + std::vector flat_nccl_ids; if (nranks_ == 1) { // FIXME(gongwb): need not to create ncclid when nranks==1 nccl_ctxs_->InitFlatCtxs( @@ -173,18 +173,18 @@ class ParallelExecutorPrivate { if (bst.enable_parallel_graph_) { VLOG(1) << "use only one ncclid in pg model"; - mcclUniqueId *nccl_id = nullptr; + ncclUniqueId *nccl_id = nullptr; std::string var_name = platform::GetFlatNCCLVarName(0); auto nccl_id_var = scope->FindVar(var_name); if (nccl_id_var) { - nccl_id = nccl_id_var->GetMutable(); + nccl_id = nccl_id_var->GetMutable(); VLOG(10) << "find nccl_id_var:" << var_name << ", nccl_id:" << nccl_id; } else { - nccl_id = new mcclUniqueId(); + nccl_id = new ncclUniqueId(); PADDLE_ENFORCE_EQ( - platform::dynload::mcclGetUniqueId(nccl_id), - mcclSuccess, + platform::dynload::ncclGetUniqueId(nccl_id), + ncclSuccess, platform::errors::PreconditionNotMet( "PaddlePaddle failed to get NCCL unique ID. 
It may due to your " "system settings or NCCL library error, please debug on NCCL")); @@ -213,7 +213,7 @@ class ParallelExecutorPrivate { PADDLE_ENFORCE_NOT_NULL( nccl_id_var, platform::errors::NotFound("Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); + auto nccl_id = nccl_id_var->GetMutable(); flat_nccl_ids.push_back(nccl_id); } @@ -221,25 +221,25 @@ class ParallelExecutorPrivate { places_, flat_nccl_ids, bst.num_trainers_, bst.trainer_id_); if (bst.use_hierarchical_allreduce_) { - std::vector inter_nccl_ids; + std::vector inter_nccl_ids; for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { std::string var_name = platform::GetHierarchicalInterNCCLVarName(i); auto nccl_id_var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(nccl_id_var, platform::errors::NotFound( "Can't find nccl_id_var '%s'.", var_name)); - auto inter_nccl_id = nccl_id_var->GetMutable(); + auto inter_nccl_id = nccl_id_var->GetMutable(); inter_nccl_ids.push_back(inter_nccl_id); } - std::vector exter_nccl_ids; + std::vector exter_nccl_ids; for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { std::string var_name = platform::GetHierarchicalExterNCCLVarName(i); auto nccl_id_var = scope->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(nccl_id_var, platform::errors::NotFound( "Can't find nccl_id_var '%s'.", var_name)); - auto nccl_id = nccl_id_var->GetMutable(); + auto nccl_id = nccl_id_var->GetMutable(); exter_nccl_ids.push_back(nccl_id); } @@ -400,7 +400,7 @@ class ParallelExecutorPrivate { std::unordered_map is_persistable_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::NCCLCommunicator *nccl_ctxs_{nullptr}; #elif defined(PADDLE_WITH_XPU_BKCL) platform::BKCLCommunicator *bkcl_ctxs_{nullptr}; @@ -512,7 +512,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { } std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place, max_memory_size); @@ -623,7 +623,7 @@ bool ParallelExecutor::NeedCreateLocalExeScope() { } void InitP2P(const std::vector &places) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::call_once(p2p_init_flag, [&]() { int count = places.size(); if (count <= 1) return; @@ -644,10 +644,6 @@ void InitP2P(const std::vector &places) { hipError_t ret = hipDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); if (ret != hipSuccess || can_acess != 1) { -#elif defined(PADDLE_WITH_MUSA) - musaError_t ret = - musaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); - if (ret != musaSuccess || can_acess != 1) { #else cudaError_t ret = cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j]); @@ -659,8 +655,6 @@ void InitP2P(const std::vector &places) { platform::CUDADeviceGuard guard(devices[i]); #ifdef PADDLE_WITH_HIP hipDeviceEnablePeerAccess(devices[j], 0); -#elif defined(PADDLE_WITH_MUSA) - musaDeviceEnablePeerAccess(devices[j], 0); #else cudaDeviceEnablePeerAccess(devices[j], 0); #endif @@ -813,12 +807,12 @@ void ParallelExecutor::BCastParamsToDevices( } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || 
defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector buffers; buffers.reserve(member_->places_.size()); size_t numel = main_tensor.numel(); auto dtype = framework::TransToProtoVarType(main_tensor.dtype()); - mcclDataType_t data_type = platform::ToNCCLDataType(dtype); + ncclDataType_t data_type = platform::ToNCCLDataType(dtype); for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; void *buffer; @@ -846,7 +840,7 @@ void ParallelExecutor::BCastParamsToDevices( platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); - platform::dynload::mcclBcast(buffers[i], + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, @@ -1288,7 +1282,7 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( BuildStrategy::ReduceStrategy::kAllReduce; member_->use_all_reduce_ = true; } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1297,8 +1291,8 @@ void ParallelExecutor::InitExecutorPrivateMemberInfo( } #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( device_count, @@ -1456,7 +1450,7 @@ void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { } if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); // Initialize device context's nccl comm, will be used by normal @@ -1507,7 +1501,7 @@ std::vector ParallelExecutor::CompileGraphWithBuildStrategy( std::vector async_graphs(device_count); auto &graphs = *device_graphs; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (member_->build_strategy_.async_mode_) { PADDLE_ENFORCE_EQ(graphs.size(), device_count, @@ -1662,7 +1656,7 @@ std::vector ParallelExecutor::CreateSSAGraphExecutor( final_graphs = *async_graphs; } else if (member_->build_strategy_.enable_parallel_graph_) { VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. bool is_inference = details::IsDataParallelInferenceGraph(*graph); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 48cd609d798e3d..32514089763c6e 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 0b77e80a0b4658..cc5cf54724dabe 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -134,7 +134,7 @@ phi::KernelKey FallBackToCpu(const phi::KernelKey& kernel_key, phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (kernel_key.backend() == phi::Backend::GPU || kernel_key.backend() == phi::Backend::GPUDNN) { PADDLE_THROW( diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index e37957918fe401..d1eb5558c54541 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -72,7 +72,7 @@ struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> struct ConvertToPhiContext { using TYPE = phi::GPUContext; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 827e39c152640e..4566927e068ca6 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +34,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); -#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL)|| (defined PADDLE_WITH_MCCL) +#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_RCCL) place_ = platform::CUDAPlace(place_id); #endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 472eb5ef9b42f8..4b629c24cf0e64 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL ||defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index f1cc62bbfd3041..85fc30978f16a4 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/lodtensor_printer.h" #include "paddle/fluid/string/string_helper.h" -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) #ifdef PADDLE_WITH_CUDA @@ -286,7 +286,7 @@ void PSGPUWorker::TrainFiles() { timeline.Start(); int total_ins_num = 0; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); @@ -511,7 +511,7 @@ void PSGPUWorker::TrainFilesWithProfiler() { int total_ins_num = 0; int cur_batch; timeline.Start(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) platform::SetDeviceId(thread_id_); #elif defined(PADDLE_WITH_XPU_BKCL) platform::SetXPUDeviceId(thread_id_); diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc index 8b740ea6156e20..f295fa7106dd43 100644 --- a/paddle/fluid/framework/pull_dense_worker.cc +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -69,11 +69,11 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { fleet_ptr_ = FleetWrapper::GetInstance(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) copy_streams_.clear(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) places_.clear(); thread_scopes_.clear(); #endif @@ -81,7 +81,7 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) { void PullDenseWorker::CreatePinVar() { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) // for (auto& v : dense_value_names_) { // for (auto& name : v.second) { for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size(); @@ -95,7 +95,7 @@ void PullDenseWorker::CreatePinVar() { auto* ptr = root_scope_->Var(name + "pin"); InitializeVariable(ptr, proto::VarType::LOD_TENSOR); phi::DenseTensor* pin_tensor = ptr->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) pin_tensor->mutable_data(tensor->dims(), platform::CUDAPinnedPlace()); #endif @@ -125,7 +125,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { } status_vec->resize(0); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) for (size_t i = 0; i < places_.size(); ++i) { // for (auto& v : dense_value_names_) { @@ -141,7 +141,7 @@ void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { Variable* var = thread_scopes_[i]->FindVar(name); phi::DenseTensor* tensor = var->GetMutable(); float* w = tensor->data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(places_[i], w, platform::CUDAPinnedPlace(), @@ -177,7 +177,7 @@ void PullDenseWorker::PullDense(bool force_update) { 
dwp_param_.program_config(0).pull_dense_table_id(i)); if (force_update || CheckUpdateParam(tid)) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) VLOG(3) << "pull dense " << force_update << " " << tid; fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 9f347ca4c01264..f88dbc409d1704 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include #include "paddle/fluid/framework/device_worker.h" @@ -228,7 +228,7 @@ void SectionWorker::TrainFiles() { int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; if (max_memory_size >= 0) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_)) { if (IsFastEagerDeletionModeEnabled()) { gc = std::make_unique(place_, diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 01267fd059c1f7..27dc5902c75ba3 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -125,7 +125,7 @@ void TensorCopyImpl(const TENSOR& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -379,7 +379,7 @@ void TensorCopySync(const phi::DenseTensor& src, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); @@ -482,7 +482,7 @@ void TensorToStream(std::ostream& os, platform::errors::ResourceExhausted( "tensor size %d overflow when writing tensor", size)); if (platform::is_gpu_place(tensor.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB std::unique_ptr buf(new char[kBufSize]); auto& gpu_dev_ctx = static_cast(dev_ctx); @@ -616,7 +616,7 @@ void TensorFromStream(std::istream& is, if (platform::is_gpu_place(dev_ctx.GetPlace()) || platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(common::make_ddim(shape)); @@ -690,7 +690,7 @@ void 
TensorFromStream(std::istream& is, platform::is_xpu_place(dev_ctx.GetPlace()) || platform::is_custom_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) phi::DenseTensor cpu_tensor; cpu_tensor.Resize(common::make_ddim(dims)); framework::VisitDataType( @@ -812,7 +812,7 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, phi::DenseTensor* dst) { if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(dl_tensor.device.device_id); @@ -852,7 +852,7 @@ void TensorFromDLPack(const DLManagedTensor* src, phi::DenseTensor* dst) { void* dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (src->dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = platform::CUDAPlace(src->dl_tensor.device.device_id); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index c4d9b9c143009a..d9e3e384337366 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -129,7 +129,7 @@ void TensorFromArray(const T* src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -175,7 +175,7 @@ void TensorFromVector(const std::vector& src, if (platform::is_cpu_place(dst_place)) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -304,7 +304,7 @@ void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, @@ -346,7 +346,7 @@ inline void TensorToVector(const phi::DenseTensor& src, if (platform::is_cpu_place(src.place())) { memory::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy(dst_place, dst_ptr, diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 75268cb5aea275..af7fc63a2122a8 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -159,7 +159,7 @@ class DistMultiTrainer : public MultiTrainer { 
std::shared_ptr pull_dense_worker_; }; -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined(PADDLE_WITH_MUSA)|| \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS)) class HeterServiceContext { @@ -175,7 +175,7 @@ class HeterServiceContext { int place_num_; Scope* scope_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuEvent_t event_; #endif std::vector ops_; @@ -207,7 +207,7 @@ class HeterXpuTrainer : public TrainerBase { virtual std::string GetDumpPath(int tid) { return ""; } virtual void InitDumpEnv() {} template -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void HeterMemCpy(phi::DenseTensor* tensor, phi::DenseTensor* root_tensor, const paddle::platform::Place& thread_place, @@ -245,7 +245,7 @@ class HeterXpuTrainer : public TrainerBase { std::vector place_scopes_; BtObjectPool object_pool_; std::vector places_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::vector copy_streams_; std::vector events_; #endif @@ -253,7 +253,7 @@ class HeterXpuTrainer : public TrainerBase { #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) class PSGPUTrainer : public TrainerBase { @@ -305,7 +305,7 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index aeb033649509fd..ba5dac4830aa18 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -72,17 +72,17 @@ REGISTER_TRAINER_CLASS(DistMultiTrainer); REGISTER_TRAINER_CLASS(HeterPipelineTrainer); #endif -#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA || \ +#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || \ defined PADDLE_WITH_XPU) && \ (defined PADDLE_WITH_PSLIB) && (!defined(PADDLE_WITH_HETERPS)) REGISTER_TRAINER_CLASS(HeterXpuTrainer); #endif -#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || defined PADDLE_WITH_MCCL || \ +#if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL || \ defined PADDLE_WITH_XPU_BKCL) && \ (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 42471cceb30252..c1f192673a7022 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -37,13 +37,6 @@ #include "paddle/fluid/operators/miopen_rnn_cache.h" #endif -#ifdef PADDLE_WITH_MUSA -#if defined(PADDLE_WITH_MCCL) -#include 
"paddle/fluid/operators/nccl/nccl_gpu_common.h" // NOLINT -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" // NOLINT -#endif -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/device/xpu/bkcl_helper.h" #endif diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 61790dc36e912e..9bffd125a3f3da 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -34,14 +34,6 @@ #include #endif #endif - -#ifdef PADDLE_WITH_MUSA -#include -#if defined(PADDLE_WITH_MCCL) -#include -#endif -#endif - #ifdef PADDLE_WITH_HIP #include #ifdef PADDLE_WITH_RCCL @@ -68,8 +60,8 @@ class SparseCsrTensor; namespace paddle { namespace platform { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class Communicator; class NCCLCommunicator; #endif @@ -198,13 +190,13 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclUniqueId, +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, #endif - // operators::CudnnRNNCache, + operators::CudnnRNNCache, #endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index ebf1fd4141ace0..b6d846e9a0c12d 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -97,7 +97,7 @@ cc_library( SRCS profiler.cc DEPS phi common) if(NOT WIN32) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) cc_library( imperative_all_reduce SRCS all_reduce.cc @@ -119,12 +119,6 @@ if(NOT WIN32) SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) endif() - if(WITH_MCCL) - musa_library( - reducer - SRCS reducer.cc reducer.cu - DEPS layer imperative_all_reduce) - endif() endif() if(WITH_XPU_BKCL) cc_library( @@ -144,7 +138,6 @@ if(NOT WIN32) if(NOT (WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL OR WITH_GLOO)) cc_library( @@ -155,7 +148,6 @@ if(NOT WIN32) endif() if(WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL OR WITH_CUSTOM_DEVICE) cc_library( @@ -177,7 +169,6 @@ if(WITH_GLOO) OR (NOT (WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL) )) cc_library( diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 5436364e56f7fd..c4bb42e4c22bb4 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" @@ -26,11 +26,6 @@ #include #endif -#ifdef PADDLE_WITH_MCCL -#include -#endif - - #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" @@ -74,16 +69,16 @@ static void AllReduce(const phi::DenseTensor &src, auto *dst_ptr = dst->mutable_data(src.place(), src.dtype()); auto nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(src.dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(src_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(src_ptr, dst_ptr, src.numel(), nccl_dtype, - mcclSum, + ncclSum, comm->comm(), stream)); } -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 static void AllReduce(const phi::SelectedRows &src, phi::SelectedRows *dst, const ParallelStrategy &strategy, @@ -106,7 +101,7 @@ static void AllReduce(const phi::SelectedRows &src, bool use_calc_stream = (dev_ctx->stream() == stream); VLOG(4) << "Is use calculate stream: " << use_calc_stream; - // 1. Gather rows number from all workers. Here use mcclAllGather to do this, + // 1. Gather rows number from all workers. Here use ncclAllGather to do this, // but we can use other ways to implement is in the future const auto &src_rows = src.rows(); phi::Vector rows_num_vector(strategy.nranks_); @@ -119,10 +114,10 @@ static void AllReduce(const phi::SelectedRows &src, dev_ctx->Wait(); } PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(gpu_rows_num_ptr + strategy.local_rank_, + platform::dynload::ncclAllGather(gpu_rows_num_ptr + strategy.local_rank_, gpu_rows_num_ptr, 1, - mcclInt64, + ncclInt64, comm->comm(), stream)); @@ -168,14 +163,14 @@ static void AllReduce(const phi::SelectedRows &src, // allgather is used to speed up the allreduce by replacing broadcast. auto row_sendcount = cpu_rows_num_ptr[0]; VLOG(3) << "allgather replaces broadcast to speed up in sparse allreduce"; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather(src_rows_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(src_rows_ptr, dst_rows_ptr, row_sendcount, - mcclInt64, + ncclInt64, comm->comm(), stream)); auto value_sendcount = cpu_rows_num_ptr[0] * feature_size; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllGather(src_tensor_ptr, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(src_tensor_ptr, dst_tensor_ptr, value_sendcount, nccl_dtype, @@ -186,10 +181,10 @@ static void AllReduce(const phi::SelectedRows &src, if (cpu_rows_num_ptr[i] > 0) { // 2. 
Broadcast the rows of SelectedRows PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBroadcast(src_rows_ptr, + platform::dynload::ncclBroadcast(src_rows_ptr, dst_rows_ptr + row_offset, cpu_rows_num_ptr[i], - mcclInt64, + ncclInt64, i, comm->comm(), stream)); @@ -197,7 +192,7 @@ static void AllReduce(const phi::SelectedRows &src, auto *dst_tensor_ptr_i = reinterpret_cast(dst_tensor_ptr) + row_offset * feature_size * sizeof_dtype; PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBroadcast(src_tensor_ptr, + platform::dynload::ncclBroadcast(src_tensor_ptr, dst_tensor_ptr_i, cpu_rows_num_ptr[i] * feature_size, nccl_dtype, @@ -217,7 +212,7 @@ static void AllReduce(const phi::SelectedRows &src, VLOG(3) << "Result SelectedRows rows: " << string::join_strings(*dst_rows, ','); } -// #endif +#endif void AllReduce(const framework::Variable &src, framework::Variable *dst, @@ -239,7 +234,7 @@ void AllReduce(const framework::Variable &src, dst->GetMutable(), stream, comm); -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 } else if (src.IsType()) { if (&src != dst) { if (!dst->IsType()) { @@ -262,7 +257,7 @@ void AllReduce(const framework::Variable &src, platform::GpuStreamSync(stream); *dst = std::move(tmp_dst); } -// #endif +#endif } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported variable type %s for imperative allreduce, only " diff --git a/paddle/fluid/imperative/all_reduce.h b/paddle/fluid/imperative/all_reduce.h index 049345772de65a..49e30549242052 100644 --- a/paddle/fluid/imperative/all_reduce.h +++ b/paddle/fluid/imperative/all_reduce.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index dfb231ead927ee..0c16a950358706 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -129,7 +129,7 @@ AmpOperators::AmpOperators() block_ops_(new std::unordered_set()), unsupported_fp16_ops_(new std::unordered_set()), unsupported_bf16_ops_(new std::unordered_set()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto unsupported_ops_gpu_fp16 = std::get<2>( OpSupportedInfos("GPU", paddle::framework::proto::VarType::FP16)); unsupported_fp16_ops_->insert(unsupported_ops_gpu_fp16.begin(), diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index 58ecec47cccf39..4e0df45e840f25 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -141,7 +141,7 @@ void GLOOParallelContext::AllReduce(const phi::SelectedRows &src, const auto &src_tensor = src.value(); const auto &place = src_tensor.place(); auto dtype = framework::TransToProtoVarType(src_tensor.dtype()); - // 1. Gather rows number from all workers. Here use mcclAllGather to do this, + // 1. Gather rows number from all workers. 
Here use ncclAllGather to do this, // but we can use other ways to implement is in the future auto &src_rows = src.rows(); auto gloo_wrapper = framework::GlooWrapper::GetInstance(); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 61bb0a1d7c14e8..267540f0807413 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -209,7 +209,7 @@ void TensorAdd(const VarType& src, VarType* dst) { } if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_TENSOR_ADD(float, phi::GPUContext); PADDLE_TENSOR_ADD(double, phi::GPUContext); PADDLE_TENSOR_ADD(phi::dtype::float16, phi::GPUContext); @@ -326,7 +326,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::GPUContext, double); @@ -334,7 +334,7 @@ void SelectedRowsAddToTensor(const VarType& src, VarType* dst) { #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -381,7 +381,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, return; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::GPUContext, double); @@ -389,7 +389,7 @@ void SelectedRowsAddTensor(const VarType& src_selected_rows_var, #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(phi::CPUContext, double); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -447,7 +447,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, return dst_var; \ } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, float); PADDLE_SELECTED_ROWS_ADD(phi::GPUContext, double); @@ -463,7 +463,7 @@ std::shared_ptr SelectedRowsMerge(const VarType& src1, #if defined(PADDLE_WITH_XPU) } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -734,7 +734,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info : tmp_grad_vars_) { @@ -800,7 +800,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 13a3d356e61c5b..d70d40808f915d 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/imperative/nccl_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -41,10 +41,10 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void NCCLParallelContext::BcastNCCLId( - std::vector &nccl_ids, // NOLINT + std::vector &nccl_ids, // NOLINT int root, int server_fd) { if (strategy_.local_rank_ == root) { @@ -64,13 +64,13 @@ void NCCLParallelContext::BcastNCCLId( void NCCLParallelContext::Init() { int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker for (auto &nccl_id : nccl_ids) { - platform::dynload::mcclGetUniqueId(&nccl_id); + platform::dynload::ncclGetUniqueId(&nccl_id); } } else { // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server @@ -101,12 +101,12 @@ void NCCLParallelContext::Init() { void NCCLParallelContext::InitWithRingID(int ring_id) { int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(1); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker - platform::dynload::mcclGetUniqueId(&nccl_ids[0]); + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); } else { // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server // on rank0. 
@@ -152,7 +152,7 @@ void NCCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { void *src_ptr = src_tensor->data(); auto nccl_dtype = platform::ToNCCLDataType( framework::TransToProtoVarType(src_tensor->dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( src_ptr, src_tensor->numel(), nccl_dtype, 0, comm->comm(), stream)); } @@ -188,9 +188,6 @@ void NCCLParallelContext::WaitCompute(int ring_id) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); @@ -221,9 +218,6 @@ void NCCLParallelContext::WaitComm(int ring_id) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index f71c57af3f4f6d..7db96b2ee3d486 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -17,7 +17,7 @@ #include #include -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -29,10 +29,6 @@ #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif - #include "paddle/fluid/imperative/parallel_context.h" namespace paddle { @@ -44,7 +40,7 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLParallelContext : public ParallelContext { public: explicit NCCLParallelContext(const ParallelStrategy& strategy, @@ -53,7 +49,7 @@ class NCCLParallelContext : public ParallelContext { ~NCCLParallelContext() override = default; - void BcastNCCLId(std::vector& nccl_ids, + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT int server_fd); diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 1545eb0bd6e68d..d336488a42327c 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -205,7 +205,7 @@ PreparedOp PrepareImpl( } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (op.CanCUDNNBeUsed(dygraph_exe_ctx, expected_kernel_key.dtype())) { expected_kernel_key.set_backend(phi::Backend::GPUDNN); } @@ -555,7 +555,7 @@ static void PreparedOpRunImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif @@ -645,7 +645,7 @@ static void PreparedOpRunPtImpl( if (FLAGS_benchmark) { dev_ctx->Wait(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; #endif diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index ef63b4a1b62d32..4bbc52662fc96e 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -29,7 +29,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) // div the nranks @@ -40,7 +40,7 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { : dense_contents_.GetMutable(); if (platform::is_gpu_place(tensor->place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DivNRanks(tensor, nranks, context); #endif } else if (platform::is_cpu_place(tensor->place())) { @@ -228,7 +228,7 @@ void SplitTensorsWithType( void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ConcatTensorsWithType(static_cast(context), dense_tensors_, &dense_contents_, @@ -264,7 +264,7 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) SplitTensorsWithType(static_cast(context), &dense_contents_, &dense_tensors_, @@ -1020,7 +1020,7 @@ void Reducer::FinalizeBackward() { if (find_unused_vars_each_step_) { // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_GLOO) ProcessUnusedDenseVars(); #endif diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index 5d89f487bc379f..59b7ecf9154230 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -17,7 +17,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void Group::DivNRanks(phi::DenseTensor *tensor, int64_t nranks, const platform::DeviceContext &context) { diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 9a6e1de71fe9d2..011c8871329a55 100644 --- a/paddle/fluid/imperative/reducer.h +++ 
b/paddle/fluid/imperative/reducer.h @@ -44,7 +44,7 @@ class VariableWrapper; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index d01fefc7795943..0f992c9b8be309 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -137,7 +137,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if (gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc = std::make_unique(place, 0); VLOG(10) << "Created GarbageCollector at " << place; @@ -147,7 +147,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc = std::make_unique(place, 0); VLOG(10) << "Created GarbageCollector at " << place; @@ -309,7 +309,7 @@ void Tracer::TraceOpImpl(const std::string& type, try { if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::SetDeviceId(place.device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 3f4e7a9344a30c..d2f834a5938e96 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) get_property(ir_targets GLOBAL PROPERTY IR_TARGETS) get_property(not_infer_modules GLOBAL PROPERTY NOT_INFER_MODULES) -set(utils_modules pretty_log string_helper benchmark utf8proc) +set(utils_modules pretty_log string_helper utf8proc) if(NOT WITH_GFLAGS) set(utils_modules ${utils_modules} paddle_flags) diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 302bc160c99387..221e6b7de1abfe 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -38,7 +38,7 @@ namespace paddle { namespace inference { namespace analysis { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { // The parameters are on the cpu, therefore, synchronization is not necessary. 
if (!argument->use_gpu()) return; @@ -215,7 +215,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { argument->scope_valid(), true, platform::errors::PreconditionNotMet("The scope field should be valid")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (argument->use_gpu_valid()) { CopyParamsToGpu(argument); } diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index 6ab7d83b8922d2..ee29af1c13308b 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -32,7 +32,7 @@ class IrParamsSyncAmongDevicesPass : public AnalysisPass { std::string repr() const override; private: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void CopyParamsToGpu(Argument *argument); #endif diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index d9d7d5aa3659ad..94e71f1cfddf16 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -100,7 +100,7 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id, Precision precision_mode) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; @@ -180,6 +180,11 @@ void AnalysisConfig::EnableXpu(int l3_size, bool transformer_encoder_adaptive_seqlen, bool enable_multi_stream) { #if defined(PADDLE_WITH_XPU) || defined(LITE_SUBGRAPH_WITH_XPU) + LOG_FIRST_N(WARNING, 1) + << "Parameters in EnableXpu/enable_xpu is deprecated since version " + "2.6.1, and will be removed in version 3.0! 
Please use " + "EnableXpu/enable_xpu without parameters, and use " + "SetXpuConfig/set_xpu_config to set options."; use_xpu_ = true; xpu_config_.l3_size = l3_size; xpu_config_.conv_autotune_level = conv_autotune; @@ -636,7 +641,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { } void AnalysisConfig::EnableCUDNN() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) use_cudnn_ = use_gpu_; #else LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; @@ -991,7 +996,7 @@ void AnalysisConfig::Update() { } if (use_gpu() && use_cudnn_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; } else { @@ -1207,7 +1212,7 @@ void AnalysisConfig::SetCpuMathLibraryNumThreads( } float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Get the GPU memory details and calculate the fraction of memory for the // GPU memory pool. size_t gpu_total, gpu_available; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index b8d95d712bdd82..476c78638c47fc 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -120,7 +120,7 @@ PHI_DECLARE_bool(pir_apply_inplace_pass); namespace paddle { namespace { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, GPUContextResource *gpu_resource, Place place_) { @@ -152,7 +152,7 @@ void UpdatePrivateDeviceContext(InferGPUContext *gpu_context, gpu_context->SetBlasTF32Handle( gpu_resource->GetBlasTF32TensorCoreHandleCreator()); gpu_context->SetDnnHandle(gpu_resource->GetDnnHandleCreator()); - // gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); + gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandleCreator()); gpu_context->SetSparseHandle(gpu_resource->GetSparseHandleCreator()); gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice()); @@ -292,7 +292,7 @@ bool PaddleTensorToDenseTensor(const PaddleTensor &pt, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place)); auto dst_gpu_place = place; @@ -424,7 +424,7 @@ bool AnalysisPredictor::Init( return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(inference): Now only gpu with external stream support private // device_context. 
if (config_.use_gpu_ && config_.use_external_stream_) { @@ -472,7 +472,7 @@ void AnalysisPredictor::InitPlace() { platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); place_ = paddle::platform::CUDAPlace(config_.gpu_device_id()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (config_.thread_local_stream_enabled()) { LOG_FIRST_N(WARNING, 1) << "We will remove this interface in the future. " "Please use config.SetExecStream instead."; @@ -543,14 +543,14 @@ void AnalysisPredictor::InitPlace() { } void AnalysisPredictor::InitResourceManager(void *stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) predictor_stream_ = ResourceManager::Instance().InitGPUResource(place_, stream); #endif } void AnalysisPredictor::InitDeviceContexts() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Init GPUContext. if (place_.GetType() == phi::AllocationType::GPU) { device_contexts_.emplace( @@ -598,7 +598,7 @@ void AnalysisPredictor::InitDeviceContexts() { } void *AnalysisPredictor::GetExecStream() const { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (place_.GetType() == phi::AllocationType::GPU) { if (private_context_) { return predictor_stream_; @@ -2315,7 +2315,7 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (!private_context_) { PADDLE_THROW(platform::errors::Fatal( @@ -2326,8 +2326,6 @@ bool AnalysisPredictor::ExpRunWithExternalStream(const gpuStream_t stream) { if (stream != predictor_stream_) { #ifdef PADDLE_WITH_HIP hipStreamSynchronize(static_cast(predictor_stream_)); -#elif defined(PADDLE_WITH_MUSA) - musaStreamSynchronize(static_cast(predictor_stream_)); #else cudaStreamSynchronize(static_cast(predictor_stream_)); #endif @@ -2367,13 +2365,11 @@ void AnalysisPredictor::HookCollectShapeRangeInfo() { paddle::platform::DeviceContextPool &pool = paddle::platform::DeviceContextPool::Instance(); if (config_.use_gpu()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto *dev_ctx = pool.Get(place_); auto stream = static_cast(dev_ctx)->stream(); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(stream); -#elif defined(PADDLE_WITH_MUSA) - musaStreamSynchronize(stream); #else cudaStreamSynchronize(stream); #endif @@ -2768,7 +2764,7 @@ AnalysisPredictor::~AnalysisPredictor() { // NOLINT if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (predictor_stream_ != nullptr) { ResourceManager::Instance().DestroyGPUResource(predictor_stream_); } @@ -3334,15 +3330,6 @@ bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, return false; } -bool InternalUtils::RunWithExternalStream(paddle_infer::Predictor *p, - musaStream_t stream) { -#ifdef PADDLE_WITH_MUSA - auto pred 
= dynamic_cast(p->predictor_.get()); - return pred->ExpRunWithExternalStream(stream); -#endif - return false; -} - bool InternalUtils::RunWithRuntimeConfig(paddle_infer::Predictor *p, void *config) { auto pred = dynamic_cast(p->predictor_.get()); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 6725915a2c00c3..4a5cfb229a459e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -208,7 +208,7 @@ class AnalysisPredictor : public PaddlePredictor { /// bool ZeroCopyRun() override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Note: Can only be used under thread_local semantics. bool ExpRunWithExternalStream(const gpuStream_t stream); #endif diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 3c26f329d4747d..d886885edb5ba5 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -250,7 +250,7 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, false, platform::errors::InvalidArgument( "Only one choice can be made between CPU and XPU.")); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast(pool.Get(place_)); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 530bc6f8a3eda7..eee3a707a03b14 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -110,7 +110,7 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_castmutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::CUDAPlace gpu_place(device_); auto *dev_ctxs = reinterpret_caststream()); #ifdef PADDLE_WITH_HIP hipStreamSynchronize(dev_ctx->stream()); -#elif defined(PADDLE_WITH_MUSA) - // async, return stream - if (nullptr != exec_stream) { - *(static_cast(exec_stream)) = dev_ctx->stream(); - // async with callback - } else if (cb) { - musaLaunchHostFunc(dev_ctx->stream(), cb, cb_params); - // sync - } else { - musaStreamSynchronize(dev_ctx->stream()); - } #else // async, return stream if (nullptr != exec_stream) { @@ -868,7 +857,7 @@ void InternalUtils::CopyFromCpuWithIoStream(paddle_infer::Tensor *t, auto *t_data = tensor->mutable_data(paddle::platform::CPUPlace()); std::memcpy(static_cast(t_data), data, ele_size); } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::platform::CUDAPlace gpu_place(t->device_); auto *t_data = tensor->mutable_data(gpu_place); 
paddle::memory::Copy(gpu_place, @@ -938,7 +927,7 @@ void InternalUtils::CopyToCpuWithIoStream(paddle_infer::Tensor *t, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #endif } else if (t->place_ == PlaceType::kGPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), t_place, diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc index d0bad85bfdee13..7879adb57d86ef 100644 --- a/paddle/fluid/inference/api/infer_context.cc +++ b/paddle/fluid/inference/api/infer_context.cc @@ -22,7 +22,7 @@ namespace paddle { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) InferGPUContext::InferGPUContext(const phi::Place& place) : phi::GPUContext(place, false) {} #endif diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h index 518a85119ed792..216c7747f07065 100644 --- a/paddle/fluid/inference/api/infer_context.h +++ b/paddle/fluid/inference/api/infer_context.h @@ -26,7 +26,7 @@ class InferCPUContext : public phi::CPUContext { using phi::CPUContext::SetEigenDevice; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class InferGPUContext : public phi::GPUContext { public: explicit InferGPUContext(const phi::Place& place); @@ -35,7 +35,7 @@ class InferGPUContext : public phi::GPUContext { using phi::GPUContext::SetBlasTF32Handle; using phi::GPUContext::SetDnnHandle; using phi::GPUContext::SetEigenDevice; - // using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSolverHandle; using phi::GPUContext::SetSparseHandle; using phi::GPUContext::SetStream; // using phi::GPUContext::SetDnnWorkspaceHandle; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 6a3e943dec7e9a..b5a26ff9225aa4 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -111,6 +111,7 @@ struct PD_INFER_DECL XpuConfig { bool conv_autotune_file_writeback{false}; // Fc autotune level. The Optional values are 0-9. Default 0 means no + // autotune. int fc_autotune_level{0}; // Base fc autotune info is read from fc_autotune_file. 
std::string fc_autotune_file; @@ -367,7 +368,7 @@ struct PD_INFER_DECL AnalysisConfig { /// void EnableXpu(int l3_size = 0xfffc00, bool l3_locked = false, - bool conv_autotune = true, + bool conv_autotune = false, const std::string& conv_autotune_file = "", const std::string& transformer_encoder_precision = "int16", bool transformer_encoder_adaptive_seqlen = false, diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 10e6d38e5a900d..3fefba9ef22be8 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -470,7 +470,6 @@ PD_INFER_DECL std::shared_ptr MakeCipher( // forward declation using cudaStream_t = struct CUstream_st*; using hipStream_t = struct ihipStream_t*; -using musaStream_t = struct MUstream_st*; namespace paddle_infer { class Predictor; @@ -508,8 +507,6 @@ class PD_INFER_DECL InternalUtils { cudaStream_t stream); static bool RunWithExternalStream(paddle_infer::Predictor* pred, hipStream_t stream); - static bool RunWithExternalStream(paddle_infer::Predictor* pred, - musaStream_t stream); static bool RunWithRuntimeConfig(paddle_infer::Predictor* pred, void* config); static void UpdateConfigInterleaved(paddle_infer::Config* c, diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 9aaa2184875dc7..4af87b029fd22f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -16,10 +16,7 @@ #ifdef PADDLE_WITH_CUDA #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif -#ifdef PADDLE_WITH_HIP +#ifdef PADDLE_WITH_HIP #include #endif #ifdef PADDLE_WITH_TENSORRT diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index 96676ff818c56c..2a8029555e94f5 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -44,7 +44,7 @@ namespace paddle { namespace internal { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class EigenGpuStreamDevice : public Eigen::StreamInterface { public: EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -102,9 +102,6 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); @@ -135,7 +132,7 @@ void CPUContextResource::InitCPUResource() { CPUContextResource::CPUContextResource() { InitCPUResource(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) : place_(place) { InitGPUResource(stream); @@ -161,8 +158,6 @@ void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); #endif @@ -171,8 +166,8 @@ void GPUContextResource::DestroyGPUResource() { DestroyDnnHandle(); 
DestroyBlasHandle(); - // DestroyBlasLtHandle(); - // DestroySolverHandle(); + DestroyBlasLtHandle(); + DestroySolverHandle(); DestroySparseHandle(); } @@ -210,21 +205,21 @@ void GPUContextResource::DestroyBlasHandle() { phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); } -// void GPUContextResource::InitBlasLtHandle() { -// phi::InitBlasLtHandle(&blaslt_handle_); -// } +void GPUContextResource::InitBlasLtHandle() { + phi::InitBlasLtHandle(&blaslt_handle_); +} -// void GPUContextResource::DestroyBlasLtHandle() { -// phi::DestroyBlasLtHandle(blaslt_handle_); -// } +void GPUContextResource::DestroyBlasLtHandle() { + phi::DestroyBlasLtHandle(blaslt_handle_); +} -// void GPUContextResource::InitSolverHandle() { -// phi::InitSolverHandle(&solver_handle_, stream_); -// } +void GPUContextResource::InitSolverHandle() { + phi::InitSolverHandle(&solver_handle_, stream_); +} -// void GPUContextResource::DestroySolverHandle() { -// phi::DestroySolverHandle(solver_handle_); -// } +void GPUContextResource::DestroySolverHandle() { + phi::DestroySolverHandle(solver_handle_); +} void GPUContextResource::InitSparseHandle() { phi::InitSparseHandle(&sparse_handle_, stream_); @@ -292,29 +287,29 @@ GPUContextResource::GetBlasTF32TensorCoreHandleCreator() { }; } -// blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { -// return blaslt_handle_; -// } +blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { + return blaslt_handle_; +} -// std::function -// GPUContextResource::GetBlasLtHandleCreator() { -// return [&]() { -// InitBlasLtHandle(); -// return blaslt_handle_; -// }; -// } +std::function +GPUContextResource::GetBlasLtHandleCreator() { + return [&]() { + InitBlasLtHandle(); + return blaslt_handle_; + }; +} -// phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { -// return solver_handle_; -// } +phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { + return solver_handle_; +} -// std::function -// GPUContextResource::GetSolverDnHandleCreator() { -// return [&]() { -// InitSolverHandle(); -// return solver_handle_; -// }; -// } +std::function +GPUContextResource::GetSolverDnHandleCreator() { + return [&]() { + InitSolverHandle(); + return solver_handle_; + }; +} phi::sparseHandle_t GPUContextResource::GetSparseHandle() const { return sparse_handle_; @@ -385,7 +380,7 @@ CPUContextResource* ResourceManager::GetCPUResource() const { return cpu_resource_.get(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { std::lock_guard lock_gurad(gpu_mutex_); if (gpu_resources_.count(stream)) { diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 96d534e8cc9540..1f4d4ea420e1b6 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -26,7 +26,7 @@ #include "paddle/utils/test_macros.h" #include "unsupported/Eigen/CXX11/Tensor" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -50,7 +50,7 @@ class CPUContextResource { std::unique_ptr cpu_eigen_device_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUContextResource { public: explicit GPUContextResource(const phi::Place& place, void* stream); @@ -61,8 +61,8 @@ class GPUContextResource { std::function GetBlasHandleCreator(); std::function GetBlasTensorCoreHandleCreator(); std::function GetBlasTF32TensorCoreHandleCreator(); - // std::function GetBlasLtHandleCreator(); - // std::function GetSolverDnHandleCreator(); + std::function GetBlasLtHandleCreator(); + std::function GetSolverDnHandleCreator(); std::function GetSparseHandleCreator(); std::function GetGpuEigenDeviceCreator(); @@ -71,8 +71,8 @@ class GPUContextResource { blasHandle_t GetBlasHandle() const; blasHandle_t GetBlasTensorCoreHandle() const; blasHandle_t GetBlasTF32Handle() const; - // blasLtHandle_t GetBlasLtHandle() const; - // phi::solverHandle_t GetSolverDnHandle() const; + blasLtHandle_t GetBlasLtHandle() const; + phi::solverHandle_t GetSolverDnHandle() const; phi::sparseHandle_t GetSparseHandle() const; Eigen::GpuDevice* GetGpuEigenDevice() const; int GetGpuComputeCapability() const; @@ -91,10 +91,10 @@ class GPUContextResource { void InitDnnHanlde(); void DestroyDnnHandle(); void DestroyBlasHandle(); - // void InitBlasLtHandle(); - // void DestroyBlasLtHandle(); - // void InitSolverHandle(); - // void DestroySolverHandle(); + void InitBlasLtHandle(); + void DestroyBlasLtHandle(); + void InitSolverHandle(); + void DestroySolverHandle(); void InitSparseHandle(); void DestroySparseHandle(); @@ -117,9 +117,9 @@ class GPUContextResource { blasHandle_t blas_handle_{nullptr}; blasHandle_t blas_tensor_core_handle_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; - // blasLtHandle_t blaslt_handle_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; - // phi::solverHandle_t solver_handle_{nullptr}; + phi::solverHandle_t solver_handle_{nullptr}; phi::sparseHandle_t sparse_handle_{nullptr}; // DnnWorkspaceHandle }; @@ -139,7 +139,7 @@ class ResourceManager { std::mutex cpu_mutex_; std::unique_ptr cpu_resource_{nullptr}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // GPU Resource public: void* InitGPUResource(const phi::Place& place, void* stream); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index f3c953fb60a97e..9b36b6dc745e85 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -127,7 +127,7 @@ void MemoryCopyAsync(const platform::Place& dst_place, if (platform::is_cpu_place(dst_place) && platform::is_cpu_place(src_place)) { memory::Copy(cpu_place, dst_data, cpu_place, src_data, size); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cpu_place(dst_place) && platform::is_gpu_place(src_place)) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc old mode 100755 new mode 100644 index 8cf589541b1e04..10763eb911543a --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -47,6 +47,7 @@ struct SimpleOpTypeSetTeller : public Teller { #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); + int8_teller_set.insert("tile"); 
teller_set.insert("flatten_contiguous_range"); int8_teller_set.insert("flatten_contiguous_range"); teller_set.insert("rnn"); @@ -2302,15 +2303,20 @@ struct SimpleOpTypeSetTeller : public Teller { if (!with_dynamic_shape) { if (tile_inputs.find("repeat_times_tensor") != tile_inputs.end()) { if (!desc.Input("repeat_times_tensor").empty()) { + VLOG(3) << "Tile op: repeat_times_tensor is not empty."; return false; } } if (tile_inputs.find("RepeatTimes") != tile_inputs.end()) { if (!desc.Input("RepeatTimes").empty()) { + VLOG(3) << "Tile op: RepeatTimes is not empty."; return false; } } - if (!desc.HasAttr("repeat_times")) return false; + if (!desc.HasAttr("repeat_times")) { + VLOG(3) << "Tile op:`repeat_times` is not set."; + return false; + } } } #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu index 6da8e874adc813..b3b0cd35fb300b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/c_allreduce_op_plugin.cu @@ -19,7 +19,7 @@ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); @@ -30,13 +30,13 @@ namespace inference { namespace tensorrt { namespace plugin { #if defined(PADDLE_WITH_NCCL) -inline mcclDataType_t NvInferDtypeToNCCLDType(nvinfer1::DataType type) { +inline ncclDataType_t NvInferDtypeToNCCLDType(nvinfer1::DataType type) { if (type == nvinfer1::DataType::kFLOAT) { - return mcclFloat; + return ncclFloat; } else if (type == nvinfer1::DataType::kHALF) { - return mcclFloat16; + return ncclFloat16; } else if (type == nvinfer1::DataType::kINT8) { - return mcclInt8; + return ncclInt8; } else if (type == nvinfer1::DataType::kINT32) { return ncclInt32; } else { @@ -159,23 +159,23 @@ int CAllReducePluginDynamic::enqueue( auto input_type = input_desc[0].type; void* sendbuff = const_cast(inputs[0]); void* recvbuff = outputs[0]; - mcclDataType_t dtype = NvInferDtypeToNCCLDType(input_type); - mcclRedOp_t nccl_red_type = mcclSum; + ncclDataType_t dtype = NvInferDtypeToNCCLDType(input_type); + ncclRedOp_t nccl_red_type = ncclSum; switch (red_type_) { case kRedSum: - nccl_red_type = mcclSum; + nccl_red_type = ncclSum; break; case kRedMax: - nccl_red_type = mcclMax; + nccl_red_type = ncclMax; break; case kRedMin: - nccl_red_type = mcclMin; + nccl_red_type = ncclMin; break; case kRedProd: - nccl_red_type = mcclProd; + nccl_red_type = ncclProd; break; default: @@ -202,9 +202,9 @@ int CAllReducePluginDynamic::enqueue( "NCCLCommContext is nullptr, collective op should " "has ring_id attr.")); auto stream = comm_ctx->GetStream(); - mcclRedOp_t nccl_red_type = mcclSum; + ncclRedOp_t nccl_red_type = ncclSum; // comm_ctx->AllReduce(&inputs[0], inputs[0], nccl_red_type, stream); - phi::dynload::mcclAllReduce(sendbuff, + phi::dynload::ncclAllReduce(sendbuff, recvbuff, numel, dtype, @@ -215,7 +215,7 @@ int CAllReducePluginDynamic::enqueue( } else { auto comm = platform::NCCLCommContext::Instance().Get(ring_id_); cudaStream_t custream = use_calc_stream_ ? 
stream : comm->stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index fec0a927b20e8b..298f54de48e8f3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -218,9 +218,6 @@ void QkvToContextPluginDynamic::configurePlugin( #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(fake_qk_bias_, 0, size, dev_ctx.stream())); diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 3dbc06bfc11b7e..0ad2cb0e3f0c84 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,8 +1,3 @@ -cc_library( - benchmark - SRCS benchmark.cc - DEPS enforce common) -paddle_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_library( infer_io_utils SRCS io_utils.cc @@ -13,13 +8,5 @@ cc_library( DEPS proto_desc enforce common) cc_library(table_printer SRCS table_printer.cc) -paddle_test(test_table_printer SRCS table_printer_tester.cc) proto_library(shape_range_info_proto SRCS shape_range_info.proto) - -if(WITH_ONNXRUNTIME AND WIN32) - # Copy onnxruntime for some c++ test in Windows, since the test will - # be build only in CI, so suppose the generator in Windows is Ninja. - copy_onnx(test_benchmark) - copy_onnx(test_table_printer) -endif() diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc deleted file mode 100644 index 24bc99ed183fad..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/inference/utils/benchmark.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace inference { - -std::string Benchmark::SerializeToString() const { - std::stringstream ss; - ss << "-----------------------------------------------------\n"; - ss << "name\t"; - ss << "batch_size\t"; - ss << "num_threads\t"; - ss << "latency\t"; - ss << "qps"; - ss << '\n'; - - ss << name_ << "\t"; - ss << batch_size_ << "\t\t"; - ss << num_threads_ << "\t"; - ss << latency_ << "\t"; - ss << 1000.0 / latency_; - ss << '\n'; - return ss.str(); -} -void Benchmark::PersistToFile(const std::string &path) const { - std::ofstream file(path, std::ios::app); - PADDLE_ENFORCE_EQ( - file.is_open(), - true, - platform::errors::Unavailable("Can not open %s to add benchmark.", path)); - file << SerializeToString(); - file.flush(); - file.close(); -} - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h deleted file mode 100644 index 56789843c3728e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -#include "paddle/utils/test_macros.h" - -namespace paddle { -namespace inference { - -/* - * Helper class to calculate the performance. - */ -struct TEST_API Benchmark { - int batch_size() const { return batch_size_; } - void SetBatchSize(int x) { batch_size_ = x; } - - int num_threads() const { return num_threads_; } - void SetNumThreads(int x) { num_threads_ = x; } - - bool use_gpu() const { return use_gpu_; } - void SetUseGpu() { use_gpu_ = true; } - - float latency() const { return latency_; } - void SetLatency(float x) { latency_ = x; } - - const std::string& name() const { return name_; } - void SetName(const std::string& name) { name_ = name; } - - std::string SerializeToString() const; - void PersistToFile(const std::string& path) const; - - private: - bool use_gpu_{false}; - int batch_size_{0}; - float latency_; - int num_threads_{1}; - std::string name_; -}; - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc deleted file mode 100644 index 8f7614cb10a44e..00000000000000 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/inference/utils/benchmark.h" - -using namespace paddle::inference; // NOLINT -TEST(Benchmark, basic) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString(); -} - -TEST(Benchmark, PersistToFile) { - Benchmark benchmark; - benchmark.SetName("key0"); - benchmark.SetBatchSize(10); - benchmark.SetUseGpu(); - benchmark.SetLatency(220); - - benchmark.PersistToFile("1.log"); - benchmark.PersistToFile("2.log"); - benchmark.PersistToFile("3.log"); -} diff --git a/paddle/fluid/inference/utils/table_printer_tester.cc b/paddle/fluid/inference/utils/table_printer_tester.cc deleted file mode 100644 index fc482807b2854c..00000000000000 --- a/paddle/fluid/inference/utils/table_printer_tester.cc +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "paddle/fluid/inference/utils/table_printer.h" - -namespace paddle { -namespace inference {} // namespace inference -} // namespace paddle - -TEST(table_printer, output) { - std::vector header{"config", "value"}; - paddle::inference::TablePrinter table(header); - - // model_dir - table.InsertRow({"model_dir", "./model_dir"}); - // model - table.InsertRow({"model_file", "./model.pdmodel"}); - table.InsertRow({"params_file", "./model.pdiparams"}); - - table.InsetDivider(); - // gpu - table.InsertRow({"use_gpu", "true"}); - table.InsertRow({"gpu_device_id", "0"}); - table.InsertRow({"memory_pool_init_size", "100MB"}); - table.InsertRow({"thread_local_stream", "false"}); - table.InsetDivider(); - - // trt precision - table.InsertRow({"use_trt", "true"}); - table.InsertRow({"trt_precision", "fp32"}); - table.InsertRow({"enable_dynamic_shape", "true"}); - table.InsertRow({"DisableTensorRtOPs", "{}"}); - table.InsertRow({"EnableVarseqlen", "ON"}); - table.InsertRow({"tensorrt_dla_enabled", "ON"}); - table.InsetDivider(); - - // lite - table.InsertRow({"use_lite", "ON"}); - table.InsetDivider(); - - // xpu - table.InsertRow({"use_xpu", "true"}); - table.InsertRow({"xpu_device_id", "0"}); - table.InsetDivider(); - - // ir - table.InsertRow({"ir_optim", "true"}); - table.InsertRow({"ir_debug", "false"}); - table.InsertRow({"enable_memory_optim", "false"}); - table.InsertRow({"EnableProfile", "false"}); - table.InsertRow({"glog_info_disabled", "false"}); - table.InsetDivider(); - - // cpu - table.InsertRow({"CpuMathLibrary", "4"}); - // mkldnn - table.InsertRow({"enable_mkldnn", "false"}); - table.InsertRow({"mkldnn_cache_capacity", "10"}); - - // a long string - table.InsertRow( - {"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ a long string " - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~", - "------------------------------------------ a long value " - "-----------------------------------------------------"}); - - LOG(INFO) << table.PrintTable(); -} diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index aed5d674e49ff5..5b49d927ae6762 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -62,17 +62,6 @@ if(WITH_ROCM) DEPS malloc gpu_info place) endif() -if(WITH_MUSA) - musa_test( - malloc_test - SRCS malloc_test.cu - DEPS device_context malloc) - musa_test( - cuda_managed_memory_test - SRCS cuda_managed_memory_test.cu - DEPS malloc gpu_info place) -endif() - if(WITH_TESTING AND TEST cuda_managed_memory_test) set_tests_properties( cuda_managed_memory_test diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index eae17991ff2fe5..ffce57d78f1642 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -19,7 +19,7 @@ set(ALLOCATOR_SRCS buddy_allocator.cc system_allocator.cc) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) list( APPEND ALLOCATOR_SRCS @@ -90,13 +90,6 @@ if(WITH_ROCM) SRCS thread_local_allocator_test.cc DEPS allocator) endif() -if(WITH_MUSA) - musa_test( - thread_local_allocator_test - SRCS thread_local_allocator_test.cc - DEPS allocator) -endif() - if(WITH_GPU) nv_test( @@ -108,11 +101,6 @@ elseif(WITH_ROCM) best_fit_allocator_test SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu DEPS allocator memcpy) -elseif(WITH_MUSA) - musa_test( - best_fit_allocator_test - SRCS best_fit_allocator_test.cc best_fit_allocator_test.cu - DEPS allocator memcpy) else() 
cc_test_old(best_fit_allocator_test SRCS best_fit_allocator_test.cc DEPS allocator) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 17839ecf0caecc..dd86ba9855fbab 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -26,9 +26,9 @@ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/flags.h" -#ifdef PADDLE_WITH_MCCL -#include -#include "paddle/fluid/platform/dynload/mccl.h" +#ifdef PADDLE_WITH_NCCL +#include +#include "paddle/fluid/platform/dynload/nccl.h" #endif PHI_DECLARE_string(allocator_strategy); @@ -144,22 +144,22 @@ using DecoratedAllocationPtr = template static T&& FillValue(T&& allocation) { -#if defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) if (allocation != nullptr) { if (FLAGS_sync_after_alloc || FLAGS_alloc_fill_value >= 0) { - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); if (FLAGS_alloc_fill_value >= 0) { VLOG(10) << "Set " << FLAGS_alloc_fill_value << " on " << allocation->ptr() << " " << allocation->place() << " " << allocation->size(); if (platform::is_gpu_place(allocation->place())) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemset( + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size())); } else { std::memset( allocation->ptr(), FLAGS_alloc_fill_value, allocation->size()); } - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); + PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); } } } diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index e7df0f7213363f..59ab4eaf154724 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include #include "paddle/fluid/memory/allocation/cuda_allocator.h" @@ -165,7 +165,7 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAAllocatorMap = std::map>>; @@ -193,7 +193,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -219,7 +219,7 @@ class AllocatorFacadePrivate { case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) allow_free_idle_chunk_ = allow_free_idle_chunk; for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), @@ -294,7 +294,7 @@ class AllocatorFacadePrivate { InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for 
(int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } @@ -353,7 +353,7 @@ class AllocatorFacadePrivate { LIKELY(FLAGS_use_system_allocator == false); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool HasCUDAAllocator(const platform::CUDAPlace& place, gpuStream_t stream) { auto it = cuda_allocators_.find(place); if (it == cuda_allocators_.end()) { @@ -730,7 +730,7 @@ class AllocatorFacadePrivate { #endif } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void InitNaiveBestFitCUDAPinnedAllocator() { if (FLAGS_use_auto_growth_pinned_allocator) { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; @@ -804,7 +804,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) auto cuda_allocator = CreateCUDAAllocator(p); cuda_allocators_[p][stream] = std::make_shared( cuda_allocator, @@ -890,7 +890,7 @@ class AllocatorFacadePrivate { auto chunk_size = FLAGS_auto_growth_chunk_size_in_mb << 20; VLOG(4) << "FLAGS_auto_growth_chunk_size_in_mb is " << FLAGS_auto_growth_chunk_size_in_mb; -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_HIP) auto cuda_allocator = CreateCUDAAllocator(p); allocators_[p] = std::make_shared( cuda_allocator, @@ -1252,7 +1252,7 @@ class AllocatorFacadePrivate { system_allocators_[p] = std::make_shared(p); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) system_allocators_[platform::CUDAPinnedPlace()] = std::make_shared(); int device_count = platform::GetGPUDeviceCount(); @@ -1276,7 +1276,7 @@ class AllocatorFacadePrivate { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); @@ -1322,7 +1322,7 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (is_stream_safe_cuda_allocator_used_) { CheckCUDAAllocThreadSafe(cuda_allocators_); } @@ -1355,7 +1355,7 @@ class AllocatorFacadePrivate { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // a standalone CUDA allocator to support multi-stream GC in new executor std::map> default_stream_safe_cuda_allocators_; @@ -1489,7 +1489,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, } } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) AllocatorFacadePrivate* m = GetPrivate(); if 
(!m->IsStreamSafeCUDAAllocatorUsed()) { VLOG(6) << "Warning: StreamSafeCUDAAllocator is not used!"; @@ -1515,7 +1515,7 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, bool AllocatorFacade::InSameStream( const std::shared_ptr& allocation, const phi::Stream& stream) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t s = reinterpret_cast(stream.id()); return s == GetStream(allocation); #else @@ -1527,7 +1527,7 @@ bool AllocatorFacade::IsStreamSafeCUDAAllocatorUsed() { return GetPrivate()->IsStreamSafeCUDAAllocatorUsed(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, gpuStream_t stream) { AllocatorFacadePrivate* m = GetPrivate(); diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 39819e0d66bdc9..acfd73a411932f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -81,7 +81,7 @@ class AllocatorFacade { bool IsStreamSafeCUDAAllocatorUsed(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // TODO(zhiqiu): change gpuStream_t to phi::Stream if needed. uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/allocation/buddy_allocator.cc b/paddle/fluid/memory/allocation/buddy_allocator.cc index 0f532d1fff4d78..4f08db4921f8ba 100644 --- a/paddle/fluid/memory/allocation/buddy_allocator.cc +++ b/paddle/fluid/memory/allocation/buddy_allocator.cc @@ -19,7 +19,7 @@ limitations under the License. 
 */
 #include "glog/logging.h"
 #include "paddle/phi/core/flags.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #define USE_DEVICE
 PHI_DECLARE_uint64(reallocate_gpu_memory_in_mb);
 #endif
@@ -54,7 +54,7 @@ BuddyAllocator::BuddyAllocator(
     };
     use_custom_device_ = true;
   } else {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     init_allocate_size_func_ = &platform::GpuInitAllocSize;
     re_allocate_size_func_ = &platform::GpuReallocSize;
 #endif
@@ -279,7 +279,7 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     allocate_bytes = DeviceAllocateSize(
         init_allocate_size_func_, re_allocate_size_func_, request_bytes);
 #else
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allocate_bytes = DeviceAllocateSize(
         &platform::GpuInitAllocSize, &platform::GpuReallocSize, request_bytes);
 #endif
diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc
index 3f50fa9651ced2..781addd7dba60b 100644
--- a/paddle/fluid/memory/allocation/cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_allocator.cc
@@ -23,10 +23,6 @@
 #include
 #endif
-#ifdef PADDLE_WITH_MUSA
-#include
-#endif
-
 #include
 #include "paddle/fluid/platform/cuda_device_guard.h"
diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
index 139e2358d161c8..7286f84160c6ad 100644
--- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
+++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h
@@ -82,9 +82,6 @@ class GPUContextAllocator : public Allocator {
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(
         hipEventCreateWithFlags(&event_, hipEventDisableTiming));
-#elif defined(PADDLE_WITH_MUSA)
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        musaEventCreateWithFlags(&event_, musaEventDisableTiming));
 #else
     PADDLE_ENFORCE_GPU_SUCCESS(
         cudaEventCreate(&event_, cudaEventDisableTiming));
@@ -95,9 +92,8 @@ class GPUContextAllocator : public Allocator {
     if (event_) {
       platform::CUDADeviceGuard guard(place_.device);
 #ifdef PADDLE_WITH_HIP
+      PADDLE_WARN_GPU_SUCCESS(hipEventDestroy(event_));
-#elif defined(PADDLE_WITH_MUSA)
-      PADDLE_WARN_GPU_SUCCESS(musaEventDestroy(event_));
 #else
       PADDLE_WARN_GPU_SUCCESS(cudaEventDestroy(event_));
 #endif
@@ -117,9 +113,6 @@ class GPUContextAllocator : public Allocator {
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0));
-#elif defined(PADDLE_WITH_MUSA)
-    PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, default_stream_));
-    PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(default_stream_, event_, 0));
 #else
     PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_));
     PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0));
diff --git a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
index 331fe723d32bb9..77ca495cacbc70 100644
--- a/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
+++ b/paddle/fluid/memory/allocation/cuda_managed_allocator.cc
@@ -19,11 +19,6 @@
 #include
 #endif
-#ifdef PADDLE_WITH_MUSA
-#include
-#include
- -#ifdef PADDLE_WITH_HIP
 #include
 #endif
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index c8ac552bf1b73a..d39cb285517f2c 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -26,7 +26,7 @@
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/string/split.h"
 #include "paddle/phi/common/place.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "paddle/fluid/platform/flags.h"
@@ -213,7 +213,7 @@ size_t Used(const platform::XPUPlace &place) {
 }
 // For CUDA
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 class GPUBuddyAllocatorList {
  private:
  GPUBuddyAllocatorList() : devices_(platform::GetSelectedDevices()) {
@@ -283,7 +283,7 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
 template <>
 size_t Used(const platform::CUDAPlace &place) {
-#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP || defined PADDLE_WITH_MUSA)
+#if (defined PADDLE_WITH_CUDA || defined PADDLE_WITH_HIP)
   return GetGPUBuddyAllocator(place.device)->Used();
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -294,7 +294,7 @@ size_t Used(const platform::CUDAPlace &place) {
 template <>
 void *Alloc(const platform::CUDAPlace &place, size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   auto *buddy_allocator = GetGPUBuddyAllocator(place.device);
   auto *ptr = buddy_allocator->Alloc(size);
   if (ptr == nullptr) {
@@ -315,8 +315,6 @@ void *Alloc(const platform::CUDAPlace &place,
   if (FLAGS_init_allocated_mem) {
 #ifdef PADDLE_WITH_HIP
     hipMemset(ptr, 0xEF, size);
-#elif defined(PADDLE_WITH_MUSA)
-    musaMemset(ptr, 0xEF, size);
 #else
     cudaMemset(ptr, 0xEF, size);
 #endif
@@ -333,7 +331,7 @@ template <>
 void Free(const platform::CUDAPlace &place,
           void *p,
           size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   GetGPUBuddyAllocator(place.device)->Free(p);
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -343,7 +341,7 @@ void Free(const platform::CUDAPlace &place,
 template <>
 uint64_t Release(const platform::CUDAPlace &place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return GetGPUBuddyAllocator(place.device)->Release();
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -351,7 +349,7 @@ uint64_t Release(const platform::CUDAPlace &place) {
 #endif
 }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
   static std::once_flag init_flag;
   static BuddyAllocator *ba = nullptr;
@@ -369,7 +367,7 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
 template <>
 size_t Used(const platform::CUDAPinnedPlace &place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return GetCUDAPinnedBuddyAllocator()->Used();
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -380,7 +378,7 @@ size_t Used(const platform::CUDAPinnedPlace &place) {
 template <>
 void *Alloc(const platform::CUDAPinnedPlace &place, size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   auto *buddy_allocator = GetCUDAPinnedBuddyAllocator();
   void *ptr = buddy_allocator->Alloc(size);
@@ -402,7 +400,7 @@ template <>
 void Free(const platform::CUDAPinnedPlace &place,
           void *p,
           size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   VLOG(10) << "Free " << size << " bytes on " << platform::Place(place);
   GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
@@ -414,7 +412,7 @@ void Free(const platform::CUDAPinnedPlace &place,
 template <>
 uint64_t Release(
     const platform::CUDAPinnedPlace &place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   VLOG(10) << "Release on " << platform::Place(place);
   return GetCUDAPinnedBuddyAllocator()->Release();
 #else
@@ -605,7 +603,7 @@ size_t Usage::operator()(const platform::CPUPlace &cpu) const {
 }
 size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return Used(gpu);
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
@@ -614,7 +612,7 @@ size_t Usage::operator()(const platform::CUDAPlace &gpu) const {
 }
 size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   return Used(cuda_pinned);
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc
index 206ad954468010..32853f08f94e5a 100644
--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
@@ -23,8 +23,6 @@ bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
 void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr()));
-#elif defined(PADDLE_WITH_MUSA)
-  PADDLE_ENFORCE_GPU_SUCCESS(musaFreeHost(allocation->ptr()));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
 #endif
@@ -40,8 +38,6 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
   void *ptr;
 #ifdef PADDLE_WITH_HIP
   PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable));
-#elif defined(PADDLE_WITH_MUSA)
-  PADDLE_ENFORCE_GPU_SUCCESS(musaHostAlloc(&ptr, size, musaHostAllocPortable));
 #else
   PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
 #endif
diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index 30fe2d9b095eb7..48b18f07456c66 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -92,17 +92,6 @@ bool StreamSafeCUDAAllocation::CanBeFreed() {
   }
   PADDLE_ENFORCE_GPU_SUCCESS(err);
   PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event));
-
-#elif 
defined(PADDLE_WITH_MUSA) - gpuError_t err = musaEventQuery(event); - if (err == musaErrorNotReady) { - VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; - // Erase the completded event before "it" - outstanding_event_map_.erase(outstanding_event_map_.begin(), it); - return false; - } - PADDLE_ENFORCE_GPU_SUCCESS(err); - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else gpuError_t err = hipEventQuery(event); if (err == hipErrorNotReady) { @@ -139,9 +128,6 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); -#elif defined (PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&new_event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); @@ -156,8 +142,6 @@ void StreamSafeCUDAAllocation::RecordStreamWithNoGraphCapturing( #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); -#elif defined (PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(record_event, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); #endif diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index 79a7c7abf01de2..31508a10799228 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -24,9 +24,6 @@ #ifdef PADDLE_WITH_CUDA #include -#elif defined(PADDLE_WITH_MUSA) -#include -#include #else #include #endif diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index cb9c4afd7b9fcf..e9a9fcbff9831e 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_info.h" #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -120,7 +120,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { bool CPUAllocator::UseGpu() const { return false; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void* GPUAllocator::Alloc(size_t* index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr @@ -216,8 +216,6 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { // PINNED memory is visible to all CUDA contexts. #ifdef PADDLE_WITH_HIP hipError_t result = hipHostMalloc(&p, size, hipHostMallocPortable); -#elif defined(PADDLE_WITH_MUSA) - musaError_t result = musaHostAlloc(&p, size, musaHostAllocPortable); #else cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); #endif @@ -261,22 +259,6 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { platform::errors::Fatal( "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err)); } -#elif defined(PADDLE_WITH_MUSA) - err = musaFreeHost(p); - - // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFreeHost after the - // driver has already shutdown. 
This happens only if the - // process is terminating, in which case we don't care if - // cudaFreeHost succeeds. - if (err != musaErrorMusartUnloading) { - PADDLE_ENFORCE_EQ( - err, - 0, - platform::errors::Fatal( - "cudaFreeHost failed in GPUPinnedAllocator, error code is %d", - err)); - } #else err = cudaFreeHost(p); diff --git a/paddle/fluid/memory/allocation/system_allocator.h b/paddle/fluid/memory/allocation/system_allocator.h index b2cce04a04d37e..67376a3e39a224 100644 --- a/paddle/fluid/memory/allocation/system_allocator.h +++ b/paddle/fluid/memory/allocation/system_allocator.h @@ -43,7 +43,7 @@ class CPUAllocator : public SystemAllocator { virtual bool UseGpu() const; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUAllocator : public SystemAllocator { public: explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 63504621f98c5b..0c40da19d47e5f 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -57,7 +57,7 @@ void* GetBasePtr(const std::shared_ptr& allocation) { return allocation::AllocatorFacade::Instance().GetBasePtr(allocation); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream) { return allocation::AllocatorFacade::Instance().Release(place, stream); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 48fbc541e5fa91..3b098e5a13e515 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -49,7 +49,7 @@ extern bool InSameStream(const std::shared_ptr& allocation, extern void* GetBasePtr(const std::shared_ptr& allocation); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) extern uint64_t Release(const platform::CUDAPlace& place, gpuStream_t stream); void RecordStream(std::shared_ptr allocation, gpuStream_t stream); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index c8ce60e7c39d6e..bffbcbdfad76bc 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/common/place.h" +#include "paddle/utils/test_macros.h" #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/device/xpu/xpu_header.h" @@ -110,11 +111,11 @@ void Copy( #endif // PADDLE_WITH_CUSTOM_DEVICE template <> -void Copy(platform::CPUPlace, - void* dst, - platform::CPUPlace, - const void* src, - size_t num) { +TEST_API void Copy(platform::CPUPlace, + void* dst, + platform::CPUPlace, + const void* src, + size_t num) { if (UNLIKELY(num == 0)) return; VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num; std::memcpy(dst, src, num); @@ -256,8 +257,7 @@ void Copy(phi::Place dst_place, #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K #ifdef PADDLE_WITH_HIP @@ -272,22 +272,10 @@ inline void SyncCUDAStream() { } #endif } -#elif defined(PADDLE_WITH_MUSA) -inline void SyncCUDAStream() { -#if !defined(_WIN32) - musaStreamSynchronize(0); -#else - musaError_t e_sync = musaSuccess; - while (e_sync = musaStreamQuery(0)) { - if (e_sync == musaErrorNotReady) continue; - break; - } -#endif -} #else inline void SyncCUDAStream() { #if !defined(_WIN32) - cudaStreamSynchronize(0); + cudaStreamSynchronize(nullptr); #else cudaError_t e_sync = cudaSuccess; while (e_sync = cudaStreamQuery(0)) { @@ -305,7 +293,7 @@ inline void SyncCUDAStream() { // https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ template <> -void Copy( +TEST_API void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, @@ -326,12 +314,6 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyDeviceToHost, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -344,8 +326,6 @@ void Copy( "GpuMemcpySync:GPU->CPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -357,7 +337,7 @@ void Copy( } template <> -void Copy( +TEST_API void Copy( platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, @@ -378,12 +358,6 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyHostToDevice, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -396,8 +370,6 @@ void Copy( "GpuMemcpySync:CPU->GPU", platform::TracerEventType::UserDefined, 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -432,12 +404,6 @@ void Copy( num, hipMemcpyDeviceToDevice, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyDeviceToDevice, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -451,8 +417,6 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToDevice); 
-#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); #endif @@ -492,7 +456,7 @@ void Copy( } template <> -void Copy( +TEST_API void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, @@ -528,7 +492,7 @@ void Copy( if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned", platform::TracerEventType::UserDefined, @@ -539,12 +503,6 @@ void Copy( num, hipMemcpyDeviceToHost, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyDeviceToHost, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -558,8 +516,6 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyDeviceToHost); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); #endif @@ -578,7 +534,7 @@ void Copy( platform::SetDeviceId(dst_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU", platform::TracerEventType::UserDefined, @@ -589,12 +545,6 @@ void Copy( num, hipMemcpyHostToDevice, reinterpret_cast(stream)); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpyAsync(dst, - src, - num, - musaMemcpyHostToDevice, - reinterpret_cast(stream)); #else platform::GpuMemcpyAsync(dst, src, @@ -608,8 +558,6 @@ void Copy( 1); #ifdef PADDLE_WITH_HIP platform::GpuMemcpySync(dst, src, num, hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - platform::GpuMemcpySync(dst, src, num, musaMemcpyHostToDevice); #else platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); #endif @@ -796,11 +744,10 @@ void Copy(phi::Place dst_place, VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place; if (src_place.GetType() == phi::AllocationType::CPU && - dst_place.GetType() == phi::AllocationType::CPU) { + dst_place.GetType() == phi::AllocationType::CPU) { // NOLINT std::memcpy(dst, src, num); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT dst_place.GetType() == phi::AllocationType::GPUPINNED) { std::memcpy(dst, src, num); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 6754c17978ea31..fe5fae7bafaebb 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -102,7 +102,7 @@ op_library(quantize_linear_op DEPS phi common) op_library(save_combine_op DEPS string_array phi common) op_library(load_combine_op DEPS string_array) -if (WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if (WITH_GPU OR WITH_ROCM) register_cu_kernel(class_center_sample_op SRCS class_center_sample_op.cu DEPS ${OP_HEADER_DEPS}) endif() @@ -110,7 +110,7 @@ if (WITH_MKLDNN) register_mkldnn_kernel(layer_norm_op SRCS layer_norm_mkldnn_op.cc DEPS 
 ${OP_HEADER_DEPS})
 endif()
-if (WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if (WITH_GPU OR WITH_ROCM)
   op_library(activation_op SRCS activation_op.cc activation_op.kps soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
 elseif (WITH_XPU_KP)
   op_library(activation_op SRCS activation_op.cc activation_op.kps DEPS ${OP_HEADER_DEPS})
@@ -118,9 +118,9 @@ else()
   op_library(activation_op SRCS activation_op.cc DEPS ${OP_HEADER_DEPS})
 endif()
-if (WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if (WITH_GPU OR WITH_ROCM)
   op_library(sync_batch_norm_op DEPS processgroup_comm_utils)
-  if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT WITH_MUSA) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
+  if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
     op_library(sparse_attention_op DEPS processgroup_comm_utils)
   endif()
 endif()
@@ -152,10 +152,10 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} beam_search)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper ps_gpu_wrapper)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions)
 set(COMMON_OP_DEPS ${COMMON_OP_DEPS} processgroup_comm_utils)
-if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL)
+if(WITH_NCCL OR WITH_RCCL)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} process_group_nccl)
 endif()
-if (WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if (WITH_GPU OR WITH_ROCM)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor)
 endif()
 if(WITH_XPU)
diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu
index dcbe58ffceb6a1..a07f311c6125ef 100644
--- a/paddle/fluid/operators/affine_channel_op.cu
+++ b/paddle/fluid/operators/affine_channel_op.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(__NVCC__) || defined(__MUSACC__)
+#ifdef __NVCC__
 #include "cub/cub.cuh"
 #endif
diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc
index 79e677034ce0f1..2c85ec6ea2076b 100644
--- a/paddle/fluid/operators/array_to_lod_tensor_op.cc
+++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc
@@ -55,7 +55,7 @@ struct ArrayToLoDFunctor {
     if (std::is_same::value) {
       Apply(static_cast(pool.Get(place)));
     } else {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       Apply(static_cast(pool.Get(place)));
 #else
       PADDLE_THROW(
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index c25344994cb503..012edde57294a9 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include #include -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/class_center_sample_op.cu b/paddle/fluid/operators/class_center_sample_op.cu index aa03c2b57355c6..ecfae25270f911 100644 --- a/paddle/fluid/operators/class_center_sample_op.cu +++ b/paddle/fluid/operators/class_center_sample_op.cu @@ -19,14 +19,6 @@ #include typedef hiprandState curandState; namespace cub = hipcub; - -#elif defined(PADDLE_WITH_MUSA) -#include -#include - -#include -typedef murandState curandState; - #else #include #include @@ -42,7 +34,7 @@ typedef murandState curandState; #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/tensor_utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -84,11 +76,6 @@ __global__ void RandomSampleClassCenter(const int64_t n, CUDA_KERNEL_LOOP(i, n) { buffer[i] = static_cast(hiprand(&localState) % max_val); } -#elif defined(PADDLE_WITH_MUSA) - murand_init(local_seed, id, increment, &localState); - CUDA_KERNEL_LOOP(i, n) { - buffer[i] = static_cast(murand(&localState) % max_val); - } #else curand_init(local_seed, id, increment, &localState); CUDA_KERNEL_LOOP(i, n) { @@ -365,7 +352,7 @@ void ClassCenterSampleKernel(const Context& dev_ctx, phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device); T* num_classes_per_device_ptr = num_classes_per_device.data(); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -410,15 +397,15 @@ void ClassCenterSampleKernel(const Context& dev_ctx, if (comm_ctx) { comm_ctx->AllReduce( - &num_classes_per_device, num_classes_per_device, mcclSum, stream); + &num_classes_per_device, num_classes_per_device, ncclSum, stream); paddle::platform::GpuStreamSync(stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), phi::ToNCCLDataType(num_classes_per_device.dtype()), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index fdecbca81fc590..1c8c8f00217cc5 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -30,7 +30,7 @@ register_operators( DEPS ${COLLECTIVE_DEPS}) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper phi common) op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index b554d658126f54..11b51602d4d75a 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -33,12 +33,12 @@ template class AllToAllOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int send_numel = x->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -114,7 +114,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { comm_ctx->GroupEnd(); VLOG(3) << "new comm_context_manager has rid " << ring_id; } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto i = 0; i < nranks; ++i) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); @@ -122,7 +122,7 @@ class AllToAllOpCUDAKernel : public framework::OpKernel { recv_buf + offset, send_numel, dtype, i, comm->comm(), stream)); offset += send_numel; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); VLOG(3) << "old NCCLCommContext has rid " << ring_id; } #else diff --git a/paddle/fluid/operators/collective/barrier_op.cu.cc b/paddle/fluid/operators/collective/barrier_op.cu.cc index 2b1f04a491d5e3..210c42d30f6d50 100644 --- a/paddle/fluid/operators/collective/barrier_op.cu.cc +++ b/paddle/fluid/operators/collective/barrier_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
 */
 #include "paddle/fluid/operators/collective/barrier_op.h"
 #include "paddle/phi/core/distributed/comm_context_manager.h"
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/phi/core/distributed/nccl_comm_context.h"
@@ -30,12 +30,12 @@ template
 class BarrierOpCUDAKernel : public framework::OpKernel {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto in = ctx.Input("X");
     auto out = ctx.Output("Out");
     auto place = ctx.GetPlace();
-    mcclDataType_t dtype =
+    ncclDataType_t dtype =
         platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype()));
     int64_t numel = in->numel();
     const void* sendbuff = in->data();
@@ -62,7 +62,7 @@ class BarrierOpCUDAKernel : public framework::OpKernel {
           "NCCLCommContext is nullptr, collective op should "
           "has ring_id attr."));
       auto stream = comm_ctx->GetStream();
-      mcclRedOp_t nccl_red_type = mcclSum;
+      ncclRedOp_t nccl_red_type = ncclSum;
       comm_ctx->AllReduce(out, *in, nccl_red_type, stream);
       platform::GpuStreamSync(stream);
       VLOG(3) << "new NCCLCommContext has rid " << rid;
@@ -70,8 +70,8 @@ class BarrierOpCUDAKernel : public framework::OpKernel {
       auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
       // should ExecutionContext for calc stream.
       auto stream = ctx.cuda_device_context().stream();
-      mcclRedOp_t nccl_red_type = mcclSum;
-      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff,
+      ncclRedOp_t nccl_red_type = ncclSum;
+      PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff,
                                                                   recvbuff,
                                                                   numel,
                                                                   dtype,
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
index 0de5e22aaabeb6..bd105c35886cb0 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -33,10 +33,10 @@ template class CAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int nranks = ctx.Attr("nranks"); @@ -103,10 +103,10 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { comm_ctx->AllGather(out, *in, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(send_buff, + platform::dynload::ncclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index b45f568b835f8d..277988b56916f8 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -28,9 +28,9 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, ALL_LAYOUT, ops::CAllReduceMaxCUDAKernel, float, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif double, int, int64_t, diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 7bf5e59431f8ff..9cd472f4217881 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -25,14 +25,14 @@ limitations under the License. 
*/ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -309,13 +309,13 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); @@ -395,22 +395,22 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { << ", dtype:" << dtype << ", comm:" << comm << ", stream:" << stream; - mcclRedOp_t nccl_red_type = mcclSum; + ncclRedOp_t nccl_red_type = ncclSum; switch (red_type) { case kRedSum: - nccl_red_type = mcclSum; + nccl_red_type = ncclSum; break; case kRedMax: - nccl_red_type = mcclMax; + nccl_red_type = ncclMax; break; case kRedMin: - nccl_red_type = mcclMin; + nccl_red_type = ncclMin; break; case kRedProd: - nccl_red_type = mcclProd; + nccl_red_type = ncclProd; break; default: @@ -421,7 +421,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->AllReduce(out, *in, nccl_red_type, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index f886e4aaab212f..76d809cd234f03 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -28,9 +28,9 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, ALL_LAYOUT, ops::CAllReduceSumCUDAKernel, float, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif double, int, int64_t, diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 348c22bd8be48e..4d49bc4990c6ec 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -29,7 +29,7 @@ template class CBroadcastOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto x = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -50,11 +50,11 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { } else { // NOTE(liyurui): This will be removed after moving this operator to phi. int numel = x->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); auto comm = platform::NCCLCommContext::Instance().Get(rid, place); if (root == comm->rank()) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, @@ -71,7 +71,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { static_cast(out)); } } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out->data(), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << common::product(out->dims()); @@ -100,8 +100,8 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast, int64_t, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc index 2e84a0e80c2dcc..2dc9af01395468 100644 --- a/paddle/fluid/operators/collective/c_comm_init_all_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/platform/collective_helper.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -56,7 +56,7 @@ class CCommInitAllOp : public framework::OperatorBase { // platform::errors::PreconditionNotMet( // "CCommInitAllOp can run on gpu place only")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) std::vector devices = Attr>("devices"); if (devices.empty()) { devices = platform::GetSelectedDevices(); diff --git a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc index 4d92c369abfebc..39d22fcd5f50d8 100644 --- a/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_multitrainer_op.cc @@ -17,10 +17,6 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_RCCL) #include #endif - -#if defined(PADDLE_WITH_MCCL) -#include -#endif #include #include @@ -32,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" // #include "paddle/fluid/operators/distributed/distributed.h" // #include "paddle/fluid/operators/distributed/request_handler_impl.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -59,8 +55,8 @@ class CCommInitMultiTrainerOp : public framework::OperatorBase { auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( var, platform::errors::InvalidArgument("Input X must be provided.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclUniqueId* nccl_id = var->GetMutable(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclUniqueId* nccl_id = var->GetMutable(); int ntrainers = Attr("ntrainers"); int train_id = Attr("trainer_id"); diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc index 3f7683fb405cb1..086257eab60383 100644 --- a/paddle/fluid/operators/collective/c_comm_init_op.cc +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -17,11 +17,6 @@ limitations under the License. */ #if defined(PADDLE_WITH_RCCL) #include #endif - -#if defined(PADDLE_WITH_MCCL) -#include -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -29,12 +24,12 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/platform/collective_helper.h" #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #elif defined(PADDLE_WITH_XPU_BKCL) @@ -97,8 +92,8 @@ class CCommInitOp : public framework::OperatorBase { #endif } else { // TODO(wangxi): Put this in the unified header file -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - using UniqueId = mcclUniqueId; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + using UniqueId = ncclUniqueId; using CommContext = platform::NCCLCommContext; #elif defined(PADDLE_WITH_XPU_BKCL) using UniqueId = BKCLUniqueId; @@ -114,7 +109,7 @@ class CCommInitOp : public framework::OperatorBase { platform::errors::PreconditionNotMet( "CCommInitOp can run on gpu or xpu place only.")); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) auto var = scope.FindVar(Input("X")); PADDLE_ENFORCE_NOT_NULL( @@ -150,7 +145,7 @@ class CCommInitOp : public framework::OperatorBase { return; } #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) VLOG(3) << "#### use old comm lab ####"; UniqueId* comm_id = 
var->GetMutable(); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index f170e07b6532f9..d13179cbae48b1 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/phi/api/include/tensor.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -38,7 +38,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto x = ctx.Input("X"); auto out = ctx.Output("Out"); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int nranks = ctx.Attr("nranks"); @@ -65,7 +65,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel { rank, nranks)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) phi::DenseTensor temp_out; framework::DDim temp_out_dims = x->dims(); temp_out_dims[0] *= nranks; @@ -130,10 +130,10 @@ class CConcatOpCUDAKernel : public framework::OpKernel { comm_ctx->AllGather(&temp_out, *x, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(send_buff, + platform::dynload::ncclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } @@ -175,8 +175,8 @@ PD_REGISTER_STRUCT_KERNEL(c_concat, double, int, int64_t, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 9851b9d9d9f685..4a07f7e98f793c 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,14 +27,14 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -static void GenNCCLID(std::vector* nccl_ids) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +static void GenNCCLID(std::vector* nccl_ids) { for (auto& nccl_id : *nccl_ids) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } } -static void CopyNCCLIDToVar(const std::vector& nccl_ids, +static void CopyNCCLIDToVar(const std::vector& nccl_ids, std::function func, const framework::Scope& scope) { for (size_t i = 0; i < nccl_ids.size(); ++i) { @@ -44,8 +44,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, var, platform::errors::NotFound("Variable with name %s is not found", var_name.c_str())); - auto nccl_id = var->GetMutable(); - memcpy(nccl_id, &nccl_ids[i], sizeof(mcclUniqueId)); + auto nccl_id = var->GetMutable(); + memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); } } @@ -68,7 +68,7 @@ class CGenNCCLIdOp : public framework::OperatorBase { std::string endpoint = 
Attr("endpoint"); - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(1); if (!FLAGS_dynamic_static_unified_comm) { diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 26cacdd87fa863..20884d1ae8a969 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -26,14 +26,14 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -236,12 +236,12 @@ template class CReduceOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); auto place = ctx.GetPlace(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int64_t numel = in->numel(); const void* sendbuff = in->data(); @@ -286,22 +286,22 @@ class CReduceOpCUDAKernel : public framework::OpKernel { stream = ctx.cuda_device_context().stream(); } - mcclRedOp_t nccl_red_type = mcclSum; + ncclRedOp_t nccl_red_type = ncclSum; switch (red_type) { case kRedSum: - nccl_red_type = mcclSum; + nccl_red_type = ncclSum; break; case kRedMax: - nccl_red_type = mcclMax; + nccl_red_type = ncclMax; break; case kRedMin: - nccl_red_type = mcclMin; + nccl_red_type = ncclMin; break; case kRedProd: - nccl_red_type = mcclProd; + nccl_red_type = ncclProd; break; default: @@ -315,7 +315,7 @@ class CReduceOpCUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Reduce(out, *in, nccl_red_type, root, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduce(sendbuff, + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce(sendbuff, recvbuff, numel, dtype, diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index af26bf7d858ba0..cd1cf0c0176363 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -30,7 +30,7 @@ template class CReduceScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); @@ -105,14 +105,14 @@ class CReduceScatterOpCUDAKernel : public framework::OpKernel { platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); if (comm_ctx) { - comm_ctx->ReduceScatter(out, *in, mcclSum, stream); + comm_ctx->ReduceScatter(out, *in, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( send_buff, recv_buff, recv_numel, - static_cast(dtype), - mcclSum, + static_cast(dtype), + ncclSum, comm->comm(), stream)); } @@ -135,9 +135,9 @@ PD_REGISTER_STRUCT_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/c_scatter_op.cu.cc b/paddle/fluid/operators/collective/c_scatter_op.cu.cc index 86bb602256aefb..7f4b4f6734de0c 100644 --- a/paddle/fluid/operators/collective/c_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_scatter_op.cu.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_scatter_op.h" #include "paddle/phi/core/distributed/comm_context_manager.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -30,11 +30,11 @@ template class CScatterOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int numel = x->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int nranks = ctx.Attr("nranks"); @@ -123,7 +123,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { } } else { if (root_id == comm->rank()) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, @@ -137,7 +137,7 @@ class CScatterOpCUDAKernel : public framework::OpKernel { *platform::DeviceContextPool::Instance().Get(place), static_cast(&temp)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( out_ptr, numel, dtype, root_id, comm->comm(), stream)); } } diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 7ea80d8a54e9ad..f8f43d5c9da48c 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/softmax_impl.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); @@ -208,17 +208,17 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_logits.maximum(along_axis); if (comm_ctx) { - comm_ctx->AllReduce(&logits_max, logits_max, mcclMax, stream); + comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); } else { void* logits_max_buff = logits_max.mutable_data(place); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( logits_max_buff, logits_max_buff, logits_max.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(logits_max.dtype())), - mcclMax, + ncclMax, comm->comm(), stream)); } @@ -273,16 +273,16 @@ struct CSoftmaxWithCrossEntropyFunctor { predicted_logits.mutable_data(place); if (comm_ctx) { - comm_ctx->AllReduce(&predicted_logits, predicted_logits, mcclSum, stream); + comm_ctx->AllReduce(&predicted_logits, predicted_logits, ncclSum, stream); } else { void* predict_logits_buff = predicted_logits.data(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( predict_logits_buff, predict_logits_buff, predicted_logits.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(predicted_logits.dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); } @@ -301,16 +301,16 @@ struct CSoftmaxWithCrossEntropyFunctor { eigen_softmax.sum(along_axis); if (comm_ctx) { - comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, mcclSum, stream); + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); } else { void* sum_exp_logits_buff = sum_exp_logits.data(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), platform::ToNCCLDataType( framework::TransToProtoVarType(sum_exp_logits.dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index 79c32bc907045f..e100397924af56 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -39,7 +39,7 @@ template class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && !defined(_WIN32) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) auto place = ctx.GetPlace(); auto dev_ctx = static_cast( diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h index 52f4e6f6d88fee..8d60d633272a98 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.h @@ -18,14 +18,14 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #elif defined(PADDLE_WITH_XPU_BKCL) @@ -40,7 +40,7 @@ template class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto place = ctx.GetPlace(); int ring_id = ctx.Attr("ring_id"); diff --git a/paddle/fluid/operators/collective/c_wait_comm_op.cc b/paddle/fluid/operators/collective/c_wait_comm_op.cc index c97da1a737b0f2..f2eab0532b9df2 100644 --- a/paddle/fluid/operators/collective/c_wait_comm_op.cc +++ b/paddle/fluid/operators/collective/c_wait_comm_op.cc @@ -19,7 +19,7 @@ namespace framework { class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -47,7 +47,7 @@ class CWaitCommOp : public framework::OperatorBase { "wait_comm op can run on gpu place only for now, but got %s", place.DebugString())); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); gpuStream_t compute_stream = @@ -89,9 +89,6 @@ class CWaitCommOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, comm_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(compute_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/c_wait_compute_op.cc b/paddle/fluid/operators/collective/c_wait_compute_op.cc index 3088e1ed61d66e..33b56cbe6581d0 100644 --- a/paddle/fluid/operators/collective/c_wait_compute_op.cc +++ b/paddle/fluid/operators/collective/c_wait_compute_op.cc @@ -19,7 +19,7 @@ namespace framework { class Scope; } // namespace framework } // namespace paddle -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -47,7 +47,7 @@ class CWaitComputeOp : public framework::OperatorBase { "wait_compute op can run on gpu place only for now, but got %s", 
place.DebugString())); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = Attr("ring_id"); gpuStream_t compute_stream = @@ -89,9 +89,6 @@ class CWaitComputeOp : public framework::OperatorBase { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event, compute_stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(comm_stream, event, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index da13a5ba800a63..1d03cb151e4a01 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,14 +34,14 @@ class Scope; namespace paddle { namespace operators { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -static void GenNCCLID(std::vector* nccl_ids) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +static void GenNCCLID(std::vector* nccl_ids) { for (auto& nccl_id : *nccl_ids) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); } } -static void CopyNCCLIDToVar(const std::vector& nccl_ids, +static void CopyNCCLIDToVar(const std::vector& nccl_ids, std::function func, const framework::Scope& scope) { for (size_t i = 0; i < nccl_ids.size(); ++i) { @@ -51,8 +51,8 @@ static void CopyNCCLIDToVar(const std::vector& nccl_ids, var, platform::errors::NotFound("Variable with name %s is not found", var_name.c_str())); - auto nccl_id = var->GetMutable(); - memcpy(nccl_id, &nccl_ids[i], sizeof(mcclUniqueId)); + auto nccl_id = var->GetMutable(); + memcpy(nccl_id, &nccl_ids[i], sizeof(ncclUniqueId)); } } @@ -130,7 +130,7 @@ class GenNCCLIdOp : public framework::OperatorBase { << ", trainers:" << ss.str(); int server_fd = -1; - std::vector nccl_ids; + std::vector nccl_ids; nccl_ids.resize(nccl_comm_num); /// 1. init flat diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc index a1e09d2c35cbb8..7a9c02628088fd 100644 --- a/paddle/fluid/operators/collective/global_gather_op.cu.cc +++ b/paddle/fluid/operators/collective/global_gather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
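The c_wait_comm / c_wait_compute hunks above keep only the HIP and CUDA branches of one primitive: record an event on one stream and make the other stream wait on it. A self-contained CUDA-runtime sketch of that pattern follows; the kernel and sizes are placeholders, not Paddle code.

#include <cuda_runtime.h>
#include <cstdio>

#define CHECK_CUDA(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
  printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

__global__ void produce(float* p, int n) {  // stands in for a compute kernel
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) p[i] = static_cast<float>(i);
}

int main() {
  const int n = 1 << 20;
  float* buf = nullptr;
  CHECK_CUDA(cudaMalloc(&buf, n * sizeof(float)));

  cudaStream_t compute_stream, comm_stream;
  CHECK_CUDA(cudaStreamCreate(&compute_stream));
  CHECK_CUDA(cudaStreamCreate(&comm_stream));

  // Event with timing disabled: it is only a synchronization token,
  // not a profiling marker.
  cudaEvent_t event;
  CHECK_CUDA(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

  // "Compute" work on the compute stream ...
  produce<<<(n + 255) / 256, 256, 0, compute_stream>>>(buf, n);
  // ... then make the communication stream wait for it (the c_wait_compute
  // direction; c_wait_comm is the same pattern with the two streams swapped).
  CHECK_CUDA(cudaEventRecord(event, compute_stream));
  CHECK_CUDA(cudaStreamWaitEvent(comm_stream, event, 0));

  // Work enqueued on comm_stream from here on starts only after `produce` finishes.
  CHECK_CUDA(cudaMemsetAsync(buf, 0, n * sizeof(float), comm_stream));

  CHECK_CUDA(cudaStreamSynchronize(comm_stream));
  CHECK_CUDA(cudaEventDestroy(event));
  CHECK_CUDA(cudaStreamDestroy(compute_stream));
  CHECK_CUDA(cudaStreamDestroy(comm_stream));
  CHECK_CUDA(cudaFree(buf));
  return 0;
}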
*/ #include "paddle/fluid/operators/collective/global_gather_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -31,8 +31,8 @@ namespace operators { template struct GlobalGatherFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -73,7 +73,7 @@ struct GlobalGatherFunctor { cpu_global_count_data = cpu_global_count.data(); } - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -165,11 +165,11 @@ struct GlobalGatherFunctor { auto send_buf = x->data(); auto recv_buf = out->data(); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + send_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, @@ -179,7 +179,7 @@ struct GlobalGatherFunctor { send_ptr += cpu_global_count_data[idx]; } if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( recv_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, @@ -188,13 +188,13 @@ struct GlobalGatherFunctor { stream)); } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } } -// #else - // PADDLE_THROW( - // platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); @@ -205,8 +205,8 @@ struct GlobalGatherFunctor { template struct GlobalGatherProcessGroupFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -304,16 +304,14 @@ struct GlobalGatherProcessGroupFunctor { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -// #else -// PADDLE_THROW( -// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( 
platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 38a992d3baaa31..6b915d35be0430 100644 --- a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ namespace operators { template struct GlobalScatterFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -72,7 +72,7 @@ struct GlobalScatterFunctor { global_count_len = cpu_global_count.numel(); } - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); int ring_id = ctx.Attr("ring_id"); @@ -173,11 +173,11 @@ struct GlobalScatterFunctor { auto recv_buf = out->data(); for (auto i = 0; i < n_expert; ++i) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); for (auto j = 0; j < nranks; ++j) { int idx = i + j * n_expert; if (cpu_local_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( send_buf + expert_ptr[idx] * in_feat, cpu_local_count_data[idx] * in_feat, dtype, @@ -186,7 +186,7 @@ struct GlobalScatterFunctor { stream)); } if (cpu_global_count_data[idx]) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( recv_buf + recv_ptr * in_feat, cpu_global_count_data[idx] * in_feat, dtype, @@ -196,14 +196,14 @@ struct GlobalScatterFunctor { recv_ptr += cpu_global_count_data[idx]; } } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); } } -// #else -// PADDLE_THROW( -// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); @@ -214,8 +214,8 @@ struct GlobalScatterFunctor { template struct GlobalScatterProcessGroupFunctor { void operator()(const framework::ExecutionContext& ctx) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -// #if NCCL_VERSION_CODE >= 2703 +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); auto local_count = ctx.Input("local_count"); auto global_count = ctx.Input("global_count"); @@ -311,16 +311,14 @@ struct GlobalScatterProcessGroupFunctor { #ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize()); -#elif 
defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); #else PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -// #else -// PADDLE_THROW( -// platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); -// #endif +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif #else PADDLE_THROW( platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc index d53a92369df401..b4773a8eb54562 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc @@ -31,8 +31,8 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, double, int, int64_t, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif plat::float16) { } diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 863850b6e38396..b0cdabce48503a 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/partial_allgather_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,11 +32,11 @@ template class PartialAllGatherOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto in = ctx.Input("X"); auto out = ctx.Output("Out"); int64_t numel = in->numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(in->dtype())); int nranks = ctx.Attr("nranks"); @@ -128,10 +128,10 @@ class PartialAllGatherOpCUDAKernel : public framework::OpKernel { const T* send_buff = in->data() + offset; T* recv_buff = out->data(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllGather(send_buff, + platform::dynload::ncclAllGather(send_buff, recv_buff, send_numel, - static_cast(dtype), + static_cast(dtype), comm->comm(), stream)); } @@ -155,9 +155,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_allgather, ops::PartialAllGatherOpCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index fdfb31e7b2eab1..c8844058696e14 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
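partial_allgather above falls back to a direct ncclAllGather when the new comm context is not available. A single-rank sketch of that call, with Paddle's ToNCCLDataType and offset bookkeeping left out; all names below are illustrative.

#include <cuda_runtime.h>
#include <nccl.h>
#include <cstdio>
#include <vector>

int main() {
  int dev = 0;
  ncclComm_t comm;
  ncclCommInitAll(&comm, 1, &dev);   // single-process, single-GPU communicator
  cudaSetDevice(dev);
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  const size_t send_numel = 4;       // per-rank chunk size
  int nranks = 0;
  ncclCommCount(comm, &nranks);      // 1 here; recv buffer holds nranks * send_numel

  float *send_buf = nullptr, *recv_buf = nullptr;
  cudaMalloc(&send_buf, send_numel * sizeof(float));
  cudaMalloc(&recv_buf, nranks * send_numel * sizeof(float));
  std::vector<float> h = {0.f, 1.f, 2.f, 3.f};
  cudaMemcpyAsync(send_buf, h.data(), send_numel * sizeof(float),
                  cudaMemcpyHostToDevice, stream);

  // Each rank contributes send_numel elements; every rank receives the
  // concatenation of all contributions, ordered by rank.
  ncclAllGather(send_buf, recv_buf, send_numel, ncclFloat, comm, stream);

  cudaStreamSynchronize(stream);
  cudaFree(send_buf);
  cudaFree(recv_buf);
  cudaStreamDestroy(stream);
  ncclCommDestroy(comm);
  return 0;
}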
*/ #include "paddle/fluid/operators/collective/partial_recv_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ template class PartialRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) - // NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 auto out = ctx.Output("Out"); auto out_dims = out->dims(); auto numel = out->numel(); @@ -142,7 +142,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { peer, nranks)); - mcclDataType_t dtype = platform::ToNCCLDataType(type); + ncclDataType_t dtype = platform::ToNCCLDataType(type); if (comm_ctx) { auto recv_buf = distributed::GetPartialTensor(*out, offset, recv_numel); @@ -150,7 +150,7 @@ class PartialRecvOpCUDAKernel : public framework::OpKernel { comm_ctx->Recv(&recv_buf, recv_numel, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclRecv(out->data() + offset, + platform::dynload::ncclRecv(out->data() + offset, recv_numel, dtype, peer, @@ -180,9 +180,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_recv, ops::PartialRecvOpCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index d395f3a5febb34..39858b3ed37a26 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/partial_send_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -32,8 +32,8 @@ template class PartialSendCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) - // NCCL_VERSION_CODE >= 2703 +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 auto x = ctx.Input("X"); int numel = x->numel(); int rid = ctx.Attr("ring_id"); @@ -136,7 +136,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { peer, nranks)); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); if (comm_ctx) { @@ -145,7 +145,7 @@ class PartialSendCUDAKernel : public framework::OpKernel { comm_ctx->Send(send_buf, send_numel, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclSend(x->data() + offset, + platform::dynload::ncclSend(x->data() + offset, send_numel, dtype, peer, @@ -176,9 +176,9 @@ PD_REGISTER_STRUCT_KERNEL(partial_send, ops::PartialSendCUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, plat::float16) { diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index 283e75d7a53e87..41c2e70df8c35f 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/collective/recv_v2_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -29,7 +29,8 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 framework::DDim recv_shape_info(const platform::Place &place, const gpuStream_t &stream, platform::NCCLComm *comm, @@ -46,7 +47,7 @@ framework::DDim recv_shape_info(const platform::Place &place, } phi::DataType shape_dtype = phi::DataType::INT32; - mcclDataType_t nccl_dtype = + ncclDataType_t nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dtype)); // step1: recv the shape size @@ -59,7 +60,7 @@ framework::DDim recv_shape_info(const platform::Place &place, if (comm_ctx) { comm_ctx->Recv(&gpu_shape_size_tensor, 1, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( gpu_data, 1, nccl_dtype, peer, comm->comm(), stream)); } } @@ -89,7 +90,7 @@ framework::DDim recv_shape_info(const platform::Place &place, if (comm_ctx) { comm_ctx->Recv(&gpu_shape_tensor, shape_size, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( gpu_shape_data, shape_size, nccl_dtype, peer, comm->comm(), stream)); } } @@ -123,7 +124,8 @@ template class RecvOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 int rid = ctx.Attr("ring_id"); bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( @@ -214,7 +216,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { int data_type = ctx.Attr("dtype"); framework::proto::VarType::Type type = framework::proto::VarType::Type(data_type); - mcclDataType_t dtype = platform::ToNCCLDataType(type); + ncclDataType_t dtype = platform::ToNCCLDataType(type); auto *out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { @@ -233,7 +235,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Recv(out, numel, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << common::product(out_dims) << " from " << peer; @@ -272,7 +274,7 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { "be less than comm->nranks (%d).", peer, comm->nranks())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " recv " << common::product(out->dims()) << " from " << peer; @@ -297,9 +299,9 @@ PD_REGISTER_STRUCT_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && 
CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, int8_t, diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 5ad3124b32017d..86be6908e3cd28 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/send_v2_op.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -28,7 +28,8 @@ PHI_DECLARE_bool(dynamic_static_unified_comm); namespace paddle { namespace operators { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 void send_shape_info(const phi::DenseTensor& x, const platform::Place& place, const gpuStream_t& stream, @@ -45,7 +46,7 @@ void send_shape_info(const phi::DenseTensor& x, "to send the shape info.")); } phi::DataType shape_dtype = phi::DataType::INT32; - mcclDataType_t nccl_dtype = + ncclDataType_t nccl_dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(shape_dtype)); auto dims = x.dims(); int shape_size = dims.size(); @@ -72,7 +73,7 @@ void send_shape_info(const phi::DenseTensor& x, comm_ctx->Send(*gpu_shape_size_tensor, 1, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclSend(gpu_shape_size_tensor->data(), + platform::dynload::ncclSend(gpu_shape_size_tensor->data(), 1, nccl_dtype, peer, @@ -105,7 +106,7 @@ void send_shape_info(const phi::DenseTensor& x, comm_ctx->Send(*gpu_shape_tensor, shape_size, peer, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclSend(gpu_shape_tensor->data(), + platform::dynload::ncclSend(gpu_shape_tensor->data(), shape_size, nccl_dtype, peer, @@ -121,7 +122,8 @@ template class SendOpV2CUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)) +#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \ + NCCL_VERSION_CODE >= 2703 int rid = ctx.Attr("ring_id"); bool dynamic_shape = ctx.Attr("dynamic_shape"); PADDLE_ENFORCE_GE( @@ -215,12 +217,12 @@ class SendOpV2CUDAKernel : public framework::OpKernel { VLOG(3) << "LodTensorArray: idx(" << idx << ")"; auto& x = x_array.at(idx); int numel = x.numel(); - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x.dtype())); if (comm_ctx) { comm_ctx->Send(x, numel, peer, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x.data(), numel, dtype, peer, comm->comm(), stream)); } VLOG(3) << "rank " << comm->rank() << " send " @@ -245,9 +247,9 @@ class SendOpV2CUDAKernel : public framework::OpKernel { if (comm_ctx) { comm_ctx->Send(*x, numel, peer, stream); } else { - mcclDataType_t dtype = + ncclDataType_t dtype = platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclSend( + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data(), numel, dtype, peer, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " send " << common::product(x->dims()) << " to " << peer; @@ -272,9 +274,9 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, ops::SendOpV2CUDAKernel, float, double, -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, -// #endif +#endif int, int64_t, int8_t, diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index d5419d2b13a4e0..0f04a295ed263f 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -77,7 +77,7 @@ class ConditionalOp : public framework::OperatorBase { ips[0]->numel())); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::DenseTensor cpu_tensor; framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index b44be01ca1a8e2..94b946e43dc7a1 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -222,7 +222,7 @@ PD_REGISTER_KERNEL_FOR_ALL_DTYPE( ALL_LAYOUT, paddle::operators::FeedSparseCooTensorKernel) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL_FOR_ALL_DTYPE( feed_sparse_coo_tensor, GPU, diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 3fb50e695d1a36..9262ca59af970b 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -26,7 +26,7 @@ namespace imperative { class OpBase; } // namespace imperative } // namespace paddle -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -34,7 +34,7 @@ namespace paddle { namespace operators { static size_t CUDADevCount() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return platform::GetGPUDeviceCount(); #else return 0UL; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index ef0dccff7197f0..8ddce0da7faacc 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -227,7 +227,7 @@ bool GetCondData(const phi::DenseTensor &cond) { // when platform::is_gpu_place(cond.place()) or // platform::is_xpu_place(cond.place()) is true std::unique_ptr cpu_cond{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE) framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get()); #else diff --git 
a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index da1eec366937d8..509c067e24e421 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/phi/core/distributed/comm_context_manager.h" @@ -216,7 +216,7 @@ class DataNormGradKernel : public framework::OpKernel { d_batch_square_sum); if (need_sync_stats) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int rid = 0; platform::NCCLComm *comm = nullptr; const auto &comm_context_manager = @@ -247,59 +247,59 @@ class DataNormGradKernel : public framework::OpKernel { } if (comm_ctx) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm_ctx->GetNcclComm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm_ctx->GetNcclComm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm_ctx->GetNcclComm(), stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_size), reinterpret_cast(d_batch_size), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_sum), reinterpret_cast(d_batch_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( reinterpret_cast(d_batch_square_sum), reinterpret_cast(d_batch_square_sum), C, platform::ToNCCLDataType( framework::TransToProtoVarType(x->dtype())), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 688178ac7b5825..d38a72556f7596 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -11,7 +11,7 @@ function(detection_library TARGET_NAME) set(srcs) # filter cuda source file when not build with cuda/rocm foreach(src ${detection_library_SRCS}) - if(NOT 
WITH_GPU AND NOT WITH_ROCM AND NOT WITH_MUSA) + if(NOT WITH_GPU AND NOT WITH_ROCM) if(${src} MATCHES ".*\\.cc$") list(APPEND srcs ${src}) endif() @@ -57,7 +57,7 @@ detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(TMPDEPS memory) if(WITH_GPU) if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 945678dfd96acd..adb60a8a8d0642 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 6f203e9cca7379..b2bbd9c82095c8 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index 807f7e907e5ce4..d954ea1bf82af7 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -15,7 +15,6 @@ limitations under the License. 
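The bbox_util.cu.h and collect_fpn_proposals_op.cu hunks above go back to selecting cub/cub.cuh under __NVCC__ and hipCUB under __HIPCC__. Below is a sketch of that include pattern together with CUB's two-phase temp-storage idiom; the file name is hypothetical and the body is written against the CUDA path only.

// radix_sort_demo.cu -- hypothetical name. The include block mirrors the restored
// guard: plain CUB under NVCC, hipCUB aliased to cub under HIPCC (the same alias
// margin_cross_entropy_op.cu uses).
#ifdef __NVCC__
#include <cuda_runtime.h>
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include <cstdio>

// The body below uses the CUDA runtime; under HIP the same cub:: calls apply,
// but the memory management would use the hip* equivalents.
int main() {
  const int n = 4;
  float h_keys[n] = {3.f, 1.f, 4.f, 2.f};
  float *d_in = nullptr, *d_out = nullptr;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_keys, n * sizeof(float), cudaMemcpyHostToDevice);

  // CUB's two-phase pattern: the first call only reports the temp-storage
  // size, the second call performs the sort.
  void* temp = nullptr;
  size_t temp_bytes = 0;
  cub::DeviceRadixSort::SortKeys(temp, temp_bytes, d_in, d_out, n);
  cudaMalloc(&temp, temp_bytes);
  cub::DeviceRadixSort::SortKeys(temp, temp_bytes, d_in, d_out, n);

  cudaMemcpy(h_keys, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  printf("%.0f %.0f %.0f %.0f\n", h_keys[0], h_keys[1], h_keys[2], h_keys[3]);
  cudaFree(temp);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}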
*/ #pragma once #include "paddle/fluid/operators/clip_by_norm_op.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/kernels/clip_by_norm_kernel.h" #include "paddle/phi/kernels/selected_rows/clip_by_norm_kernel.h" @@ -26,49 +25,48 @@ template class DGCClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(false, "not supported"); - // auto rampup_begin_step = ctx.Attr("rampup_begin_step"); - // if (static_cast(rampup_begin_step) < 0) { - // return; - // } + auto rampup_begin_step = ctx.Attr("rampup_begin_step"); + if (static_cast(rampup_begin_step) < 0) { + return; + } - // auto current_step_tensor = ctx.Input("current_step"); - // auto* current_step = current_step_tensor->data(); + auto current_step_tensor = ctx.Input("current_step"); + auto* current_step = current_step_tensor->data(); - // VLOG(10) << "current_step:" << *current_step - // << ", rampup_begin_step:" << rampup_begin_step; + VLOG(10) << "current_step:" << *current_step + << ", rampup_begin_step:" << rampup_begin_step; - // if (static_cast(*current_step) < static_cast(rampup_begin_step)) { - // VLOG(10) << "current_step:" << *current_step - // << " < rampup_begin_step:" << rampup_begin_step - // << " so does't use dgc_clip_by_norm"; - // return; - // } + if (static_cast(*current_step) < static_cast(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc_clip_by_norm"; + return; + } - // auto in_var = ctx.InputVar("X"); - // auto max_norm = ctx.Attr("max_norm"); - // auto& dev_ctx = ctx.device_context(); + auto in_var = ctx.InputVar("X"); + auto max_norm = ctx.Attr("max_norm"); + auto& dev_ctx = ctx.device_context(); - // if (in_var->IsType()) { - // auto* x = ctx.Input("X"); - // auto* y = ctx.Output("Out"); - // return phi::ClipByNormKernel( - // static_cast::TYPE&>(dev_ctx), - // *x, - // max_norm, - // y); - // } else if (in_var->IsType()) { - // auto* x = ctx.Input("X"); - // phi::SelectedRows* output_selected_rows = - // ctx.Output("Out"); - // return phi::sr::ClipByNormKernel( - // static_cast::TYPE&>(dev_ctx), - // *x, - // max_norm, - // output_selected_rows); - // } + if (in_var->IsType()) { + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Out"); + return phi::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + y); + } else if (in_var->IsType()) { + auto* x = ctx.Input("X"); + phi::SelectedRows* output_selected_rows = + ctx.Output("Out"); + return phi::sr::ClipByNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, + max_norm, + output_selected_rows); + } }; }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 1b2dc157fb4022..face0f758f8484 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -32,14 +32,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/cpu/elementwise.h" #include "paddle/phi/kernels/cpu/elementwise_grad.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ #include #elif defined(__HIPCC__) #include -#elif defined(__MUSACC__) -#include -#include #endif #include @@ -314,7 +311,7 @@ static void FusedElemwiseAndActBroadcast2CPU(const T *x, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template GetReduceDim(const framework::DDim &in, return phi::funcs::GetReduceDim(in, out, axis); } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template void GetGradXAndYOut(const phi::GPUContext &dev_ctx, diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 8be70c6fc8e933..4c2dd992657812 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -283,7 +283,7 @@ REGISTER_OP_CPU_KERNEL(expand_grad, ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL( expand, ops::ExpandKernel, diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 976ce30d2f0be9..bdf8a80debb649 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -193,8 +193,6 @@ struct FindChannelAbsMaxFunctor { #ifdef PADDLE_WITH_HIP hipMemset(out_abs_max, 0, sizeof(T) * cout); -#elif defined(PADDLE_WITH_MUSA) - musaMemset(out_abs_max, 0, sizeof(T) * cout); #else cudaMemset(out_abs_max, 0, sizeof(T) * cout); #endif // PADDLE_FLUID_OPERATORS_FAKE_QUANTIZE_OP_CU_H_ diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 942dd94f4dca22..ced20a0108a527 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -32,16 +32,16 @@ if(WITH_XPU) op_library(fused_feedforward_op) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) # fused_bn_activation_op needs cudnn 7.4.1 above # HIP not support bn act fuse in MIOPEN - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) op_library(fused_bn_activation_op) endif() # HIP not support cudnnTransformTensor # fusion_conv_inception_op needs cudnn 7 above # HIP not support cudnnConvolutionBiasActivationForward - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) op_library(fusion_conv_inception_op) endif() op_library(yolo_box_head_op) @@ -53,12 +53,12 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) endif() # fused_bn_add_activation # HIP not support bn act fuse in MIOPEN - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7401)) op_library(fused_bn_add_activation_op) endif() # fused_dropout # only support CUDA - if(NOT WITH_ROCM AND NOT WITH_MUSA) + if(NOT WITH_ROCM) op_library(fused_feedforward_op) # fused_attention_op op_library(fused_attention_op) @@ -66,7 +66,7 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) 
op_library(fused_multi_transformer_int8_op) endif() # resnet_unit needs cudnn 8.0 above - if((NOT WITH_ROCM AND NOT WITH_MUSA) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) + if((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) op_library(resnet_unit_op) endif() diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 6b3e435529e715..8ea1e11cd29f41 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/fused/fused_attention_utils.h b/paddle/fluid/operators/fused/fused_attention_utils.h index c37b6e2307b585..b198c4a5792912 100644 --- a/paddle/fluid/operators/fused/fused_attention_utils.h +++ b/paddle/fluid/operators/fused/fused_attention_utils.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -34,7 +34,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int ring_id, const phi::GPUContext &dev_ctx) { if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -86,10 +86,10 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; } if (comm_ctx) { - comm_ctx->AllReduce(&tensor, tensor, mcclSum, stream); + comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( - sendbuff, recvbuff, numel, dtype, mcclSum, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); } } #else diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 7081180ea67667..ccd099109487c9 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include -#include +#include +#include #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/fused/quant_dequant_kernel.h" diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h index ad73be604fddb2..40717402846db5 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h @@ -19,7 +19,7 @@ limitations under the License. */ #pragma once -#include +#include #include #include @@ -39,7 +39,7 @@ limitations under the License. 
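The AllReduce helper above picks the wire type through platform::ToNCCLDataType before issuing ncclAllReduce. A reduced stand-in for that mapping, covering only a few element types; ToNcclDtype is a hypothetical name, not Paddle's helper.

#include <nccl.h>
#include <cstdint>
#include <type_traits>

// Map a C++ element type to the ncclDataType_t the collective calls expect.
// Only a handful of types are shown; Paddle's real helper also covers
// float16, bfloat16, uint8, and bool.
template <typename T>
constexpr ncclDataType_t ToNcclDtype() {
  static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value ||
                    std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value,
                "unsupported element type in this sketch");
  return std::is_same<T, float>::value     ? ncclFloat32
         : std::is_same<T, double>::value  ? ncclFloat64
         : std::is_same<T, int32_t>::value ? ncclInt32
                                           : ncclInt64;
}

int main() {
  static_assert(ToNcclDtype<float>() == ncclFloat32, "float maps to ncclFloat32");
  static_assert(ToNcclDtype<int64_t>() == ncclInt64, "int64_t maps to ncclInt64");
  return 0;
}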
*/ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/fusion/gpu/attn_gemm.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -61,7 +61,7 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT const int count, const phi::GPUContext &ctx) { if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); if (map->has(ring_id)) { @@ -117,10 +117,10 @@ static void AllReduce(phi::DenseTensor &tensor, // NOLINT VLOG(3) << "old NCCLCommContext has ring_id " << ring_id; } if (comm_ctx) { - comm_ctx->AllReduce(&tensor, tensor, mcclSum, stream); + comm_ctx->AllReduce(&tensor, tensor, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclAllReduce( - sendbuff, recvbuff, count, dtype, mcclSum, comm->comm(), stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, count, dtype, ncclSum, comm->comm(), stream)); } } #else diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index e78579a27c1a94..362860aa23bdf7 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -150,34 +150,6 @@ void FusedSeqpoolCVM(const framework::ExecutionContext lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); -#elif defined(PADDLE_WITH_MUSA) - T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); - platform::GpuMemcpyAsync(gpu_input_values, - input_data.data(), - input_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - T **gpu_output_values = - reinterpret_cast(&gpu_input_values[input_data.size()]); - platform::GpuMemcpyAsync(gpu_output_values, - output_data.data(), - output_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - T **gpu_seqpool_output_values = - reinterpret_cast(&gpu_output_values[output_data.size()]); - platform::GpuMemcpyAsync(gpu_seqpool_output_values, - seqpool_output_data.data(), - seqpool_output_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - size_t **lods_values = reinterpret_cast( - &gpu_seqpool_output_values[seqpool_output_data.size()]); - platform::GpuMemcpyAsync(lods_values, - lods.data(), - lods.size() * sizeof(size_t *), - musaMemcpyHostToDevice, - stream); #else T **gpu_input_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_input_values, @@ -384,37 +356,6 @@ void FusedSeqpoolCVMGrad(const framework::ExecutionContext &ctx, lods.size() * sizeof(size_t *), hipMemcpyHostToDevice, stream); -#elif defined(PADDLE_WITH_MUSA) - T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); - platform::GpuMemcpyAsync(gpu_out_grads_values, - out_grads_data.data(), - out_grads_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - - T **gpu_in_grads_values = - reinterpret_cast(&gpu_out_grads_values[out_grads_data.size()]); - platform::GpuMemcpyAsync(gpu_in_grads_values, - in_grads_data.data(), - in_grads_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - - T **gpu_cvm_values = - 
reinterpret_cast(&gpu_in_grads_values[in_grads_data.size()]); - platform::GpuMemcpyAsync(gpu_cvm_values, - cvm_data.data(), - cvm_data.size() * sizeof(T *), - musaMemcpyHostToDevice, - stream); - - size_t **lods_values = - reinterpret_cast(&gpu_cvm_values[cvm_data.size()]); - platform::GpuMemcpyAsync(lods_values, - lods.data(), - lods.size() * sizeof(size_t *), - musaMemcpyHostToDevice, - stream); #else T **gpu_out_grads_values = reinterpret_cast(temp_ptr->ptr()); platform::GpuMemcpyAsync(gpu_out_grads_values, diff --git a/paddle/fluid/operators/fused/yolo_box_post_op.cu b/paddle/fluid/operators/fused/yolo_box_post_op.cu index c6fe13548033ac..72bb97a2aae9ee 100644 --- a/paddle/fluid/operators/fused/yolo_box_post_op.cu +++ b/paddle/fluid/operators/fused/yolo_box_post_op.cu @@ -255,9 +255,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - bbox_count_device_ptr, &bbox_count, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_count_device_ptr, &bbox_count, sizeof(int), cudaMemcpyHostToDevice); @@ -271,9 +268,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - &bbox_count, bbox_count_device_ptr, sizeof(int), musaMemcpyDeviceToHost); #else cudaMemcpy( &bbox_count, bbox_count_device_ptr, sizeof(int), cudaMemcpyDeviceToHost); @@ -289,9 +283,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipFree(bbox_tensor); hipMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); -#elif defined(PADDLE_WITH_MUSA) - musaFree(bbox_tensor); - musaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); #else cudaFree(bbox_tensor); cudaMalloc(&bbox_tensor, bbox_count * (5 + class_num) * sizeof(float)); @@ -305,9 +296,6 @@ static void YoloTensorParseCuda( #ifdef PADDLE_WITH_HIP hipMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpy( - bbox_index_device_ptr, &bbox_index, sizeof(int), musaMemcpyHostToDevice); #else cudaMemcpy( bbox_index_device_ptr, &bbox_index, sizeof(int), cudaMemcpyHostToDevice); @@ -368,13 +356,6 @@ class YoloBoxPostKernel : public framework::OpKernel { anchors.data(), anchors.size() * sizeof(int), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc(reinterpret_cast(&device_anchors), - anchors.size() * sizeof(int)); - musaMemcpy(device_anchors, - anchors.data(), - anchors.size() * sizeof(int), - musaMemcpyHostToDevice); #else cudaMalloc(reinterpret_cast(&device_anchors), anchors.size() * sizeof(int)); @@ -407,10 +388,6 @@ class YoloBoxPostKernel : public framework::OpKernel { hipMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc( - reinterpret_cast(&ts_info[i].bboxes_dev_ptr), - ts_info[i].bbox_count_max_alloc * (5 + class_num) * sizeof(float)); #else cudaMalloc( reinterpret_cast(&ts_info[i].bboxes_dev_ptr), @@ -421,9 +398,6 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), sizeof(int)); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), - sizeof(int)); #else cudaMalloc(reinterpret_cast(&ts_info[i].bbox_count_device_ptr), 
sizeof(int)); @@ -435,8 +409,6 @@ class YoloBoxPostKernel : public framework::OpKernel { int* bbox_index_device_ptr; #ifdef PADDLE_WITH_HIP hipMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); -#elif defined(PADDLE_WITH_MUSA) - musaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #else cudaMalloc(reinterpret_cast(&bbox_index_device_ptr), sizeof(int)); #endif @@ -484,12 +456,6 @@ class YoloBoxPostKernel : public framework::OpKernel { ts_info[ts_id].bboxes_dev_ptr, ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), hipMemcpyDeviceToHost); -#elif defined(PADDLE_WITH_MUSA) - musaMemcpyAsync( - ts_info[ts_id].bboxes_host_ptr, - ts_info[ts_id].bboxes_dev_ptr, - ts_info[ts_id].bbox_count_host * (5 + class_num) * sizeof(float), - musaMemcpyDeviceToHost); #else cudaMemcpyAsync( ts_info[ts_id].bboxes_host_ptr, @@ -568,8 +534,6 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(bbox_index_device_ptr); -#elif defined(PADDLE_WITH_MUSA) - musaFree(bbox_index_device_ptr); #else cudaFree(bbox_index_device_ptr); #endif @@ -577,9 +541,6 @@ class YoloBoxPostKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP hipFree(ts_info[i].bboxes_dev_ptr); hipFree(ts_info[i].bbox_count_device_ptr); -#elif defined(PADDLE_WITH_MUSA) - musaFree(ts_info[i].bboxes_dev_ptr); - musaFree(ts_info[i].bbox_count_device_ptr); #else cudaFree(ts_info[i].bboxes_dev_ptr); cudaFree(ts_info[i].bbox_count_device_ptr); diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index c6a8a4fe7b9822..8ae92b04b7df44 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -111,7 +111,7 @@ PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL(get_tensor_from_selected_rows, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index b45fdd9619a61d..b4e0f511f6d61b 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -32,9 +32,6 @@ limitations under the License. 
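After the revert, yolo_box_post_op.cu keeps only the HIP and CUDA branches of plain runtime-API allocation, copy, and free calls. A compact sketch of that lifecycle with error checking added; the anchor values are made up and CHECK_CUDA is not a Paddle macro.

#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

#define CHECK_CUDA(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
  printf("CUDA error: %s\n", cudaGetErrorString(e)); return 1; } } while (0)

int main() {
  // Same lifecycle as the anchor/bbox buffers above: allocate on the device,
  // copy host data in, use it, copy results back, free.
  std::vector<int> anchors = {10, 13, 16, 30, 33, 23};
  int* device_anchors = nullptr;
  CHECK_CUDA(cudaMalloc(reinterpret_cast<void**>(&device_anchors),
                        anchors.size() * sizeof(int)));
  CHECK_CUDA(cudaMemcpy(device_anchors, anchors.data(),
                        anchors.size() * sizeof(int), cudaMemcpyHostToDevice));

  std::vector<int> round_trip(anchors.size(), 0);
  CHECK_CUDA(cudaMemcpy(round_trip.data(), device_anchors,
                        anchors.size() * sizeof(int), cudaMemcpyDeviceToHost));
  printf("first anchor: %d\n", round_trip[0]);

  CHECK_CUDA(cudaFree(device_anchors));
  return 0;
}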
*/ #ifdef PADDLE_WITH_HIP #include #include -#elif defined(PADDLE_WITH_MUSA) -#include -#include #else #include #include @@ -98,12 +95,6 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, threadIdx.y * WARP_SIZE + threadIdx.x, 0, &rng); -#elif defined(PADDLE_WITH_MUSA) - murandState rng; - murand_init(rand_seed * gridDim.x + blockIdx.x, - threadIdx.y * WARP_SIZE + threadIdx.x, - 0, - &rng); #else curandState rng; curand_init(rand_seed * gridDim.x + blockIdx.x, @@ -137,8 +128,6 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); -#elif defined(PADDLE_WITH_MUSA) - const int num = murand(&rng) % (idx + 1); #else const int num = curand(&rng) % (idx + 1); #endif diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 3530beda000b4e..c88d36602bd79c 100644 --- a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP // HIP not support cudnnSpatialTfGridGeneratorForward #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/hinge_loss_op.cc b/paddle/fluid/operators/hinge_loss_op.cc index ea38db87e63e7d..dea3ce3fe695b8 100644 --- a/paddle/fluid/operators/hinge_loss_op.cc +++ b/paddle/fluid/operators/hinge_loss_op.cc @@ -156,7 +156,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( hinge_loss_grad, CPU, ALL_LAYOUT, ops::HingeLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL( hinge_loss, GPU, ALL_LAYOUT, ops::HingeLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index e1e9ca5ef66673..8c123bb8a32f22 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -201,7 +201,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( im2sequence_grad, CPU, ALL_LAYOUT, ops::Im2SequenceGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL( im2sequence, GPU, ALL_LAYOUT, ops::Im2SequenceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h index 5c03b7395a4f24..5352ccc99df92e 100644 --- a/paddle/fluid/operators/isfinite_op.h +++ b/paddle/fluid/operators/isfinite_op.h @@ -67,7 +67,7 @@ bool TensorIsfinite(const phi::DenseTensor& tensor); FiniteVisitor(Isnan, Any, CPU); FiniteVisitor(Isinf, Any, CPU); FiniteVisitor(Isfinite, All, CPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) FiniteVisitor(Isnan, Any, GPU); FiniteVisitor(Isinf, Any, GPU); FiniteVisitor(Isfinite, All, GPU); @@ -82,7 +82,7 @@ inline void TensorContainsNAN(const phi::DenseTensor& tensor, IsnanVisitorCPU(tensor, out)); return; } -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsnanVisitorGPU(tensor, out)); @@ -99,7 +99,7 @@ inline void TensorContainsInf(const phi::DenseTensor& tensor, IsinfVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsinfVisitorGPU(tensor, out)); @@ -116,7 +116,7 @@ inline void TensorIsfinite(const phi::DenseTensor& tensor, IsfiniteVisitorCPU(tensor, out)); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { VisitDataTypeNormal(paddle::framework::TransToProtoVarType(tensor.dtype()), IsfiniteVisitorGPU(tensor, out)); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 3918ba54599808..8f0b705c8de79f 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -96,7 +96,7 @@ PD_REGISTER_STRUCT_KERNEL(l1_norm, CPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, CPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL(l1_norm, GPU, ALL_LAYOUT, ops::L1NormKernel, float) {} PD_REGISTER_STRUCT_KERNEL( l1_norm_grad, GPU, ALL_LAYOUT, ops::L1NormGradKernel, float) {} diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 197aaa74bb3e13..dd85ccff87f2d2 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -133,7 +133,7 @@ PD_REGISTER_KERNEL(load, CPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, CPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(load, GPU, ALL_LAYOUT, ops::LoadKernel, float) {} PD_REGISTER_KERNEL( load_sr, GPU, ALL_LAYOUT, ops::LoadSelectedRowsKernel, float) {} diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index da8ea875e93938..94b03197291174 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -66,7 +66,7 @@ struct LoDTensorToArrayFunctor { if (std::is_same::value) { Apply(static_cast(dev_ctx)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) Apply(static_cast(dev_ctx)); #else PADDLE_THROW( diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 3f0ccf3bf40ffb..edd8b20da160c5 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -221,9 +221,6 @@ struct LookupTableV2GradCUDAFunctor { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); -#elif 
defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 216e9863a5e277..75ef56accb10b4 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -16,8 +16,6 @@ #ifdef PADDLE_WITH_HIP #include namespace cub = hipcub; -#elif defined(PADDLE_WITH_MUSA) - #else #include #endif @@ -38,7 +36,7 @@ namespace cub = hipcub; #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -74,7 +72,7 @@ void GetClassInterval(const gpuStream_t& stream, return; } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DenseTensor num_classes_per_device; phi::TensorFromVector(shard_dim_vec, dev_ctx, &num_classes_per_device); int* num_classes_per_device_ptr = num_classes_per_device.data(); @@ -125,15 +123,15 @@ void GetClassInterval(const gpuStream_t& stream, if (comm_ctx) { comm_ctx->AllReduce(&num_classes_per_device, num_classes_per_device, - mcclSum, + ncclSum, calcu_stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( num_classes_per_device_ptr, num_classes_per_device_ptr, num_classes_per_device.numel(), phi::ToNCCLDataType(num_classes_per_device.dtype()), - mcclSum, + ncclSum, comm->comm(), calcu_stream)); } @@ -272,7 +270,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, DenseTensor* loss) { const auto& place = dev_ctx.GetPlace(); // old code -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) paddle::platform::NCCLComm* comm = nullptr; const auto& comm_context_manager = phi::distributed::CommContextManager::GetInstance(); @@ -407,7 +405,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, phi::kps::IdentityFunctor(), {1}); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -421,14 +419,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(&logits_max, logits_max, mcclMax, stream); + comm_ctx->AllReduce(&logits_max, logits_max, ncclMax, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(logits_max_buff, + phi::dynload::ncclAllReduce(logits_max_buff, logits_max_buff, logits_max.numel(), phi::ToNCCLDataType(logits_max.dtype()), - mcclMax, + ncclMax, comm->comm(), stream)); } @@ -452,7 +450,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, phi::kps::ExpFunctor(), {1}); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -466,14 
+464,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, mcclSum, stream); + comm_ctx->AllReduce(&sum_exp_logits, sum_exp_logits, ncclSum, stream); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), phi::ToNCCLDataType(sum_exp_logits.dtype()), - mcclSum, + ncclSum, comm->comm(), stream)); } @@ -514,7 +512,7 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, class_interval.data()); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nranks > 1) { if (pg) { std::vector in_tensor; @@ -528,14 +526,14 @@ void MarginCrossEntropyKernel(const Context& dev_ctx, task->Wait(); } else { if (comm_ctx) { - comm_ctx->AllReduce(loss, *loss, mcclSum, stream); + comm_ctx->AllReduce(loss, *loss, ncclSum, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(loss_ptr, + phi::dynload::ncclAllReduce(loss_ptr, loss_ptr, loss->numel(), phi::ToNCCLDataType(loss->dtype()), - mcclSum, + ncclSum, comm->comm(), stream)); } diff --git a/paddle/fluid/operators/math/bert_encoder_functor.h b/paddle/fluid/operators/math/bert_encoder_functor.h index d1e0a772f3eaa6..76e27380b90e21 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.h +++ b/paddle/fluid/operators/math/bert_encoder_functor.h @@ -20,12 +20,6 @@ limitations under the License. */ #include // NOLINT #endif - -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include @@ -53,7 +47,7 @@ struct CUDATypeTraits { typedef float TYPE; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // This functor involves a fusion calculation in Ernie or Bert. 
// The fusion mode is as follows: // diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index 2b0d3432720dfa..857d870847ee8c 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -28,7 +28,7 @@ struct GRUUnitFunctor { const detail::ActivationType active_node, const detail::ActivationType active_gate, bool origin_mode) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(false, @@ -92,7 +92,7 @@ struct GRUUnitGradFunctor { const detail::ActivationType active_node, const detail::ActivationType active_gate, bool origin_mode) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) detail::backward_state_grad(detail::backward::gru_stateGrad(), value, grad, @@ -182,7 +182,7 @@ struct GRUUnitFunctorV2 { int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) auto blas = phi::funcs::GetBlas(context); if (value.prev_out_value) { blas.GEMM(CblasNoTrans, @@ -234,7 +234,7 @@ struct GRUUnitGradFunctorV2 { int batch_size, const detail::ActivationType active_node, const detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC__) +#if !defined(__NVCC__) && !defined(__HIPCC___) // calculate grad_update_gate, grad_frame_state, // grad_reset_output, grad_reset_gate detail::cpu_gru_backward(context, diff --git a/paddle/fluid/operators/math/inclusive_scan.h b/paddle/fluid/operators/math/inclusive_scan.h index 792a08423be0ac..3032b78a2029d0 100644 --- a/paddle/fluid/operators/math/inclusive_scan.h +++ b/paddle/fluid/operators/math/inclusive_scan.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 1762353abaa9f2..00ff1fbcbc38db 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -23,7 +23,7 @@ namespace paddle { namespace operators { namespace math { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template class PreluChannelWiseDirectCUDAFunctor { public: diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 87fe1ee33f0f15..bf028c4ada3695 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -160,11 +160,6 @@ void GPUSampleWithProb::operator()(const phi::GPUContext& context, s_data, sizeof(int64_t) * num_samples, hipMemcpyHostToDevice)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(samples_data + num_true, - s_data, - sizeof(int64_t) * num_samples, - musaMemcpyHostToDevice)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, diff --git a/paddle/fluid/operators/math/sample_prob.h b/paddle/fluid/operators/math/sample_prob.h index da8c22aa67bbb3..524ba826a57047 100644 --- a/paddle/fluid/operators/math/sample_prob.h +++ b/paddle/fluid/operators/math/sample_prob.h @@ -106,7 +106,7 @@ class SampleWithProb { } }; 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template class GPUSampleWithProb { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index f082189fa0f370..895a427bae6e20 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -98,7 +98,7 @@ ComputeMatmulImpl(const framework::ExecutionContext &context) { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) head_number = context.Attr("head_number"); #endif @@ -112,7 +112,7 @@ ComputeMatmulImpl(const framework::ExecutionContext &context) { } } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); if (head_number > 1) { @@ -271,7 +271,7 @@ class MatMulGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) if (context.HasAttr("head_number")) { head_number = context.Attr("head_number"); } @@ -403,7 +403,7 @@ class MatMulDoubleGradKernel : public framework::OpKernel { int head_number = 1; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) head_number = context.Attr("head_number"); #endif @@ -645,7 +645,7 @@ class MatMulOp : public framework::OperatorWithKernel { } int64_t dim_out_y = mat_dim_y.width_; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) int head_number = context->Attrs().Get("head_number"); bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); if (context->IsRuntime()) { @@ -788,7 +788,7 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) AddAttr("head_number", "The number of heads of the matrix") .SetDefault(1); #endif diff --git a/paddle/fluid/operators/memcpy_h2d_op.h b/paddle/fluid/operators/memcpy_h2d_op.h index a4b6e061bfdff0..5f480461d77cdb 100644 --- a/paddle/fluid/operators/memcpy_h2d_op.h +++ b/paddle/fluid/operators/memcpy_h2d_op.h @@ -39,7 +39,7 @@ class MemcpyH2DFunctor { void operator()(const phi::DenseTensor &lod_tensor) const { auto &out_tensor = *out_->GetMutable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto stream = static_cast(&dev_ctx_)->stream(); #else auto stream = nullptr; diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 935b93d1c3ae31..3ed27460e16b6c 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -68,7 +68,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || 
defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       framework::TensorCopy(
           mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
 #else
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 580ea2da8721cd..64bc176d971492 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -157,6 +157,6 @@ REGISTER_OPERATOR(minus,
                   ops::MinusGradMaker);
 PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {}
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {}
 #endif
diff --git a/paddle/fluid/operators/nccl/CMakeLists.txt b/paddle/fluid/operators/nccl/CMakeLists.txt
index 2d079c8ef521d6..629b41b4b582b7 100644
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
@@ -1,4 +1,4 @@
-if(NOT (WITH_NCCL OR WITH_RCCL OR WITH_MCCL))
+if(NOT (WITH_NCCL OR WITH_RCCL))
   return()
 endif()
@@ -16,14 +16,7 @@ if(WITH_ROCM AND NOT WIN32)
     DEPS device_context operator)
 endif()
-if(WITH_MUSA AND NOT WIN32)
-  musa_library(
-    nccl_common
-    SRCS nccl_gpu_common.cc
-    DEPS device_context operator)
-endif()
-
-if(WITH_GPU OR WITH_ROCM OR WITH_MUSA)
+if(WITH_GPU OR WITH_ROCM)
   op_library(nccl_op DEPS nccl_common)
   set(OPERATOR_DEPS
       ${OPERATOR_DEPS}
       nccl_common
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
index 4916d71b2f73a0..9f7d967a84708e 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc
@@ -18,7 +18,7 @@ namespace paddle {
 namespace platform {
 namespace {
 // TODO(panyx0718): Where to destroy them.
-std::unique_ptr> global_comms;
+std::unique_ptr> global_comms;
 std::unique_ptr> comm_id_map;
 bool inited = false;
 size_t last_num_gpus = -1;
@@ -41,21 +41,21 @@ void Communicator::InitAll(const std::vector& gpus) {
   if (global_comms) {
     for (size_t i = 0; i < global_comms->size(); ++i) {
       // FIXME(dzh) : PADDLE_ENFORCE return void
-      dynload::mcclCommDestroy((*global_comms)[i]);
+      dynload::ncclCommDestroy((*global_comms)[i]);
     }
   }
-  global_comms = std::make_unique>();
+  global_comms = std::make_unique>();
   comm_id_map = std::make_unique>();
   global_comms->resize(gpus.size());
   for (size_t i = 0; i < gpus.size(); ++i) {
     (*comm_id_map)[gpus[i]] = i;
   }
   PADDLE_ENFORCE_GPU_SUCCESS(
-      dynload::mcclCommInitAll(global_comms->data(), gpus.size(), gpus.data()));
+      dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data()));
   inited = true;
 }
-const std::vector& Communicator::comms() const {
+const std::vector& Communicator::comms() const {
   std::lock_guard guard(comm_mu);
   return *global_comms;
 }
diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.h b/paddle/fluid/operators/nccl/nccl_gpu_common.h
index 0427180d56c04f..01905d8ca84b3b 100644
--- a/paddle/fluid/operators/nccl/nccl_gpu_common.h
+++ b/paddle/fluid/operators/nccl/nccl_gpu_common.h
@@ -25,8 +25,6 @@ limitations under the License.
*/ #include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include "paddle/fluid/platform/dynload/mccl.h" #else #include "paddle/fluid/platform/dynload/nccl.h" #endif @@ -44,7 +42,7 @@ struct Communicator { void InitAll(const std::vector& gpus); - const std::vector& comms() const; + const std::vector& comms() const; }; } // namespace platform diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index 7e9b2b1d4dd19f..8b06aa653c070f 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -105,8 +105,8 @@ class NCCLAllReduceOp : public framework::OperatorWithKernel { std::string reduction = ctx->Attrs().Get("reduction"); PADDLE_ENFORCE_EQ( - (reduction == "mcclSum" || reduction == "mcclProd" || - reduction == "mcclMin" || reduction == "mcclMax"), + (reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), true, platform::errors::InvalidArgument("invalid nccl reduction.")); @@ -124,9 +124,9 @@ class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of AllReduce op"); AddAttr("reduction", - "(string, default 'mcclSum') " - "{'mcclMin', 'mcclMax', 'mcclProd', 'mcclSum'}.") - .SetDefault("mcclSum"); + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); AddComment(R"DOC( NCCLAllReduce Operator. @@ -151,8 +151,8 @@ class NCCLReduceOp : public framework::OperatorWithKernel { std::string reduction = ctx->Attrs().Get("reduction"); PADDLE_ENFORCE_EQ( - (reduction == "mcclSum" || reduction == "mcclProd" || - reduction == "mcclMin" || reduction == "mcclMax"), + (reduction == "ncclSum" || reduction == "ncclProd" || + reduction == "ncclMin" || reduction == "ncclMax"), true, platform::errors::InvalidArgument("invalid nccl reduction.")); @@ -170,9 +170,9 @@ class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Communicator", "Communicator for communicating between gpus"); AddOutput("Out", "The output of Reduce op"); AddAttr("reduction", - "(string, default 'mcclSum') " - "{'mcclMin', 'mcclMax', 'mcclProd', 'mcclSum'}.") - .SetDefault("mcclSum"); + "(string, default 'ncclSum') " + "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.") + .SetDefault("ncclSum"); AddAttr("root", "(int, default kInvalidGPUId) " "Root gpu of the parameter. 
If not, " @@ -246,10 +246,10 @@ REGISTER_OPERATOR( ops::NCCLInitOpVarTypeInference, ops::NCCLInitOpShapeInference); -REGISTER_OP_WITHOUT_GRADIENT(mcclAllReduce, +REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp, ops::NCCLAllReduceOpMaker); -REGISTER_OP_WITHOUT_GRADIENT(mcclBcast, +REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp, ops::NCCLBcastOpMaker); REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, diff --git a/paddle/fluid/operators/nccl/nccl_op.cu.cc b/paddle/fluid/operators/nccl/nccl_op.cu.cc index 7b99c47cf13c88..abb24cc8cae10d 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cu.cc @@ -27,33 +27,33 @@ class NCCLTypeWrapper; template <> class NCCLTypeWrapper { public: - static const mcclDataType_t type = mcclFloat; + static const ncclDataType_t type = ncclFloat; }; template <> class NCCLTypeWrapper { public: - static const mcclDataType_t type = mcclDouble; + static const ncclDataType_t type = ncclDouble; }; -static mcclRedOp_t str_to_nccl_red_type(std::string reduction) { - static const std::unordered_map str_to_type = { - {"mcclSum", mcclSum}, - {"mcclMin", mcclMin}, - {"mcclMax", mcclMax}, - {"mcclProd", mcclProd}, +static ncclRedOp_t str_to_nccl_red_type(std::string reduction) { + static const std::unordered_map str_to_type = { + {"ncclSum", ncclSum}, + {"ncclMin", ncclMin}, + {"ncclMax", ncclMax}, + {"ncclProd", ncclProd}, }; auto it = str_to_type.find(reduction); PADDLE_ENFORCE_EQ(it != str_to_type.end(), true, platform::errors::InvalidArgument( - "Invalid nccl reduction. Must be mcclMin | mcclMax | " - "mcclProd | mcclSum")); + "Invalid nccl reduction. Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); return it->second; } template -class mcclAllReduceKernel : public framework::OpKernel { +class NCCLAllReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), @@ -74,7 +74,7 @@ class mcclAllReduceKernel : public framework::OpKernel { << " invoke allreduce. send " << x->numel() << " recv " << out->numel(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclAllReduce(x->data(), + platform::dynload::ncclAllReduce(x->data(), out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, @@ -115,7 +115,7 @@ class NCCLReduceKernel : public framework::OpKernel { VLOG(3) << "gpu : " << gpu_id << " invoke reduce. send " << x->numel() << " recv " << out->numel(); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclReduce(x->data(), + platform::dynload::ncclReduce(x->data(), recvbuffer, x->numel(), NCCLTypeWrapper::type, @@ -144,7 +144,7 @@ class NCCLBcastKernel : public framework::OpKernel { if (idx == root) { auto* x = ctx.Input("X"); VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. send " << x->numel(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclBcast( + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), x->numel(), NCCLTypeWrapper::type, @@ -157,7 +157,7 @@ class NCCLBcastKernel : public framework::OpKernel { VLOG(3) << "gpu : " << gpu_id << " invoke Bcast. 
recv buffer " << common::product(out->dims()); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclBcast(out->mutable_data(ctx.GetPlace()), + platform::dynload::ncclBcast(out->mutable_data(ctx.GetPlace()), out->numel(), NCCLTypeWrapper::type, root, @@ -173,8 +173,8 @@ class NCCLBcastKernel : public framework::OpKernel { namespace ops = paddle::operators; PD_REGISTER_STRUCT_KERNEL( - mcclAllReduce, GPU, ALL_LAYOUT, ops::mcclAllReduceKernel, float) {} + ncclAllReduce, GPU, ALL_LAYOUT, ops::NCCLAllReduceKernel, float) {} PD_REGISTER_STRUCT_KERNEL( - mcclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} + ncclBcast, GPU, ALL_LAYOUT, ops::NCCLBcastKernel, float) {} PD_REGISTER_STRUCT_KERNEL( ncclReduce, GPU, ALL_LAYOUT, ops::NCCLReduceKernel, float) {} diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 8290da165800b5..6b0a36fc564721 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -30,13 +30,13 @@ #include "paddle/phi/kernels/funcs/tensor_to_string.h" #include "paddle/utils/optional.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/nccl_comm_context.h" #include "paddle/phi/core/flags.h" PHI_DECLARE_bool(dynamic_static_unified_comm); #endif -#if defined(__NVCC__) || defined(__MUSACC__) +#ifdef __NVCC__ #include "cub/cub.cuh" #include "math.h" // NOLINT #endif @@ -74,8 +74,6 @@ static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { static_assert(!std::is_same::value, "T cannot be void."); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(x, 0, n * sizeof(T), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); #endif @@ -273,10 +271,6 @@ static bool IsFinite(const phi::GPUContext &dev_ctx, const float *ptr) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &cpu_value, ptr, sizeof(float), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( - &cpu_value, ptr, sizeof(float), musaMemcpyDeviceToHost, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &cpu_value, ptr, sizeof(float), cudaMemcpyDeviceToHost, stream)); @@ -901,14 +895,14 @@ static void MultiTensorUpdateLambParamAndBetaPows( #undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static bool CreatePreMulScaleOpIfSupported( - mcclDataType_t dtype, - mcclComm_t comm, + ncclDataType_t dtype, + ncclComm_t comm, const void *scale, - mcclRedOp_t *op, + ncclRedOp_t *op, distributed::NCCLCommContext *comm_ctx = nullptr) { -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_NOT_NULL( comm_ctx, @@ -919,32 +913,32 @@ static bool CreatePreMulScaleOpIfSupported( "But parameter of comm_ctx should not be nullptr.")); int ver = comm_ctx->GetNcclVersion(); if (ver >= 21100) { - VLOG(10) << "mcclRedOpCreatePreMulSum is supported."; + VLOG(10) << 
"ncclRedOpCreatePreMulSum is supported."; comm_ctx->RedOpCreatePreMulSum( - op, const_cast(scale), dtype, mcclScalarDevice); + op, const_cast(scale), dtype, ncclScalarDevice); return true; } } else { int ver; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetVersion(&ver)); if (ver >= 21100) { - VLOG(10) << "mcclRedOpCreatePreMulSum is supported."; - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpCreatePreMulSum( - op, const_cast(scale), dtype, mcclScalarDevice, comm)); + VLOG(10) << "ncclRedOpCreatePreMulSum is supported."; + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( + op, const_cast(scale), dtype, ncclScalarDevice, comm)); return true; } } -// #endif - VLOG(10) << "mcclRedOpCreatePreMulSum is not supported."; +#endif + VLOG(10) << "ncclRedOpCreatePreMulSum is not supported."; return false; } static void DestoryOpIfSupported( - mcclRedOp_t op, - mcclComm_t comm, + ncclRedOp_t op, + ncclComm_t comm, distributed::NCCLCommContext *comm_ctx = nullptr) { -// #if NCCL_VERSION_CODE >= 21100 - VLOG(10) << "mcclRedOpDestroy starts"; +#if NCCL_VERSION_CODE >= 21100 + VLOG(10) << "ncclRedOpDestroy starts"; if (FLAGS_dynamic_static_unified_comm) { PADDLE_ENFORCE_NOT_NULL( @@ -956,12 +950,12 @@ static void DestoryOpIfSupported( "But parameter of comm_ctx should not be nullptr.")); comm_ctx->RedOpDestroy(op); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpDestroy(op, comm)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, comm)); } - VLOG(10) << "mcclRedOpDestroy ends"; + VLOG(10) << "ncclRedOpDestroy ends"; -// #endif - VLOG(10) << "mcclRedOpDestroy is not supported."; +#endif + VLOG(10) << "ncclRedOpDestroy is not supported."; } template @@ -986,11 +980,11 @@ static void LaunchScaleKernel(const phi::GPUContext &dev_ctx, } template -static void mcclSumWithScaleBase(const T *sendbuff, +static void NCCLSumWithScaleBase(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - mcclComm_t comm, + ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, @@ -1022,9 +1016,9 @@ static void mcclSumWithScaleBase(const T *sendbuff, return; } - mcclRedOp_t op = mcclSum; - mcclDataType_t dtype = - std::is_same::value ? mcclFloat32 : mcclFloat16; + ncclRedOp_t op = ncclSum; + ncclDataType_t dtype = + std::is_same::value ? ncclFloat32 : ncclFloat16; bool should_destroy_op = scale && CreatePreMulScaleOpIfSupported( dtype, comm, scale, &op, comm_ctx); memory_utils::Buffer buffer(dev_ctx.GetPlace()); @@ -1040,7 +1034,7 @@ static void mcclSumWithScaleBase(const T *sendbuff, // TODO(BeingGod): NCCLCommContext::ReduceScatter only accept DenseTensor, // but sendbuff or recvbuff maybe allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclReduceScatter(sendbuff, + phi::dynload::ncclReduceScatter(sendbuff, recvbuff, recvcount, dtype, @@ -1051,7 +1045,7 @@ static void mcclSumWithScaleBase(const T *sendbuff, // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but sendbuff or recvbuff maybe allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(sendbuff, + phi::dynload::ncclAllReduce(sendbuff, recvbuff, recvcount, dtype, @@ -1061,10 +1055,10 @@ static void mcclSumWithScaleBase(const T *sendbuff, } } else { if (UseReduceScatter) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclReduceScatter( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduceScatter( sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce( sendbuff, recvbuff, recvcount, dtype, op, comm, stream)); } } @@ -1075,16 +1069,16 @@ static void mcclSumWithScaleBase(const T *sendbuff, } template -static void mcclReduceScatterWithScale(const T *sendbuff, +static void NCCLReduceScatterWithScale(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - mcclComm_t comm, + ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - mcclSumWithScaleBase(sendbuff, + NCCLSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, @@ -1096,16 +1090,16 @@ static void mcclReduceScatterWithScale(const T *sendbuff, } template -static void mcclAllReduceWithScale(const T *sendbuff, +static void NCCLAllReduceWithScale(const T *sendbuff, T *recvbuff, size_t recvcount, size_t nranks, - mcclComm_t comm, + ncclComm_t comm, gpuStream_t stream, const phi::GPUContext &dev_ctx, distributed::NCCLCommContext *comm_ctx, const T *scale = nullptr) { - mcclSumWithScaleBase(sendbuff, + NCCLSumWithScaleBase(sendbuff, recvbuff, recvcount, nranks, @@ -1246,10 +1240,6 @@ static std::string GetMinMaxStr(const T *x, size_t n, const phi::Place &place) { PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), hipMemcpyDeviceToHost, stream)); PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync( - &ret_cpu[0], ret, 2 * sizeof(T), musaMemcpyDeviceToHost, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync( &ret_cpu[0], ret, 2 * sizeof(T), cudaMemcpyDeviceToHost, stream)); @@ -1306,12 +1296,6 @@ static bool HasNanInf(const phi::GPUContext &dev_ctx, const T *x, int numel) { sizeof(flag), hipMemcpyDeviceToHost, dev_ctx.stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(&flag, - out.Get(), - sizeof(flag), - musaMemcpyDeviceToHost, - dev_ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&flag, out.Get(), @@ -1474,7 +1458,7 @@ void DistributedFusedLambKernel( DenseTensor *acc_step, DenseTensor *stop_update, DenseTensor *step) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto stream = dev_ctx.stream(); auto place = dev_ctx.GetPlace(); found_inf->Resize({1}); @@ -1772,7 +1756,7 @@ void DistributedFusedLambKernel( // Step 6: allreduce + global norm gradient clip int64_t global_rank = 0, local_rank = 0; - mcclComm_t global_comm = nullptr, local_comm = nullptr, + ncclComm_t global_comm = nullptr, local_comm = nullptr, external_comm = nullptr; paddle::platform::NCCLComm *nccl_comm_handle = nullptr, *local_nccl_comm_handle = nullptr; @@ -1884,7 +1868,7 @@ void DistributedFusedLambKernel( // (1) ReduceScater first if (local_shard) { if (use_hierarchical_allreduce) { - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( 
fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -1893,7 +1877,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -1903,7 +1887,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -1912,7 +1896,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -1922,7 +1906,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - mcclAllReduceWithScale(fp32_grad_data, + NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -1930,7 +1914,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclAllReduceWithScale(fp16_grad_data, + NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -1942,7 +1926,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - mcclReduceScatterWithScale(fp32_grad_data, + NCCLReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, nranks, @@ -1950,7 +1934,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclReduceScatterWithScale(fp16_grad_data, + NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, @@ -1973,11 +1957,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(fp32_square_grad_norm, + phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } @@ -2030,7 +2014,7 @@ void DistributedFusedLambKernel( << HasNanInf(dev_ctx, fp16_grad_data, fp16_numel); if (local_shard) { if (use_hierarchical_allreduce) { - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2040,7 +2024,7 @@ void DistributedFusedLambKernel( dev_ctx, local_comm_ctx, fp32_scale); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2049,7 +2033,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, external_comm_ctx); - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2059,7 +2043,7 @@ void DistributedFusedLambKernel( dev_ctx, local_comm_ctx, fp16_scale); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2069,7 +2053,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - mcclAllReduceWithScale(fp32_grad_data, + NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -2078,7 +2062,7 @@ void DistributedFusedLambKernel( dev_ctx, comm_ctx, fp32_scale); - mcclAllReduceWithScale(fp16_grad_data, + NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -2091,7 +2075,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - mcclReduceScatterWithScale(fp32_grad_data, + NCCLReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, nranks, @@ -2100,7 +2084,7 @@ void DistributedFusedLambKernel( dev_ctx, comm_ctx, fp32_scale); - mcclReduceScatterWithScale(fp16_grad_data, + NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, nranks, @@ -2125,11 +2109,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(fp32_square_grad_norm, + phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); VLOG(1) << "Grad square norm after all reduce: " @@ -2142,7 +2126,7 @@ void DistributedFusedLambKernel( } else { if (local_shard) { if (use_hierarchical_allreduce) { - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp32_grad_data, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2151,7 +2135,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_sum_grad + local_rank * fp32_numel_each_device, fp32_numel_each_device, @@ -2160,7 +2144,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, external_comm_ctx); - mcclReduceScatterWithScale( + NCCLReduceScatterWithScale( fp16_grad_data, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2169,7 +2153,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, local_comm_ctx); - mcclAllReduceWithScale( + NCCLAllReduceWithScale( fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_sum_grad + local_rank * fp16_numel_each_device, fp16_numel_each_device, @@ -2179,7 +2163,7 @@ void DistributedFusedLambKernel( dev_ctx, external_comm_ctx); } else { - mcclAllReduceWithScale(fp32_grad_data, + NCCLAllReduceWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel, nranks, @@ -2187,7 +2171,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclAllReduceWithScale(fp16_grad_data, + NCCLAllReduceWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel, nranks, @@ -2199,7 +2183,7 @@ void DistributedFusedLambKernel( fp32_sum_grad += (local_rank * fp32_numel_each_device); fp16_sum_grad += (local_rank * fp16_numel_each_device); } else { - mcclReduceScatterWithScale(fp32_grad_data, + NCCLReduceScatterWithScale(fp32_grad_data, fp32_sum_grad, fp32_numel_each_device, num_devices, @@ -2207,7 +2191,7 @@ void DistributedFusedLambKernel( stream, dev_ctx, comm_ctx); - mcclReduceScatterWithScale(fp16_grad_data, + NCCLReduceScatterWithScale(fp16_grad_data, fp16_sum_grad, fp16_numel_each_device, num_devices, @@ -2227,11 +2211,11 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but fp32_square_grad_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(fp32_square_grad_norm, + phi::dynload::ncclAllReduce(fp32_square_grad_norm, fp32_square_grad_norm, 1, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } @@ -2373,26 +2357,26 @@ void DistributedFusedLambKernel( // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but param_square_norm is allocated by Buffer. PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(param_square_norm + fp32_global_param_num, + phi::dynload::ncclAllReduce(param_square_norm + fp32_global_param_num, param_square_norm + fp32_global_param_num, 2 * param_num - fp32_global_param_num, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } else { // TODO(BeingGod): NCCLCommContext::AllReduce only accept DenseTensor, // but trust_ratio_div_square_norm is allocated by Buffer. 
PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(trust_ratio_div_square_norm, + phi::dynload::ncclAllReduce(trust_ratio_div_square_norm, trust_ratio_div_square_norm, param_num, - mcclFloat32, - mcclSum, + ncclFloat32, + ncclSum, local_comm, stream)); } - VLOG(10) << "mcclAllReduce done"; + VLOG(10) << "ncclAllReduce done"; } LogParamAndTrustRatioDivSquareNorm<1>( @@ -2417,7 +2401,7 @@ void DistributedFusedLambKernel( beta1, beta2); if (num_devices > 1) { - // mcclAllGather + // ncclAllGather if (local_comm_ctx) { auto send_buf = distributed::GetPartialTensor( *fp32_param_out, fp32_offset, fp32_numel_each_device); @@ -2426,10 +2410,10 @@ void DistributedFusedLambKernel( local_comm_ctx->AllGather(&recv_buf, send_buf, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllGather(fp32_param_data + fp32_offset, + phi::dynload::ncclAllGather(fp32_param_data + fp32_offset, fp32_param_data, fp32_numel_each_device, - mcclFloat32, + ncclFloat32, local_comm, stream)); } @@ -2455,7 +2439,7 @@ void DistributedFusedLambKernel( beta1, beta2); if (num_devices > 1) { - // mcclAllGather + // ncclAllGather if (local_comm_ctx) { auto send_buf = distributed::GetPartialTensor( *fp16_param_out, fp16_offset, fp16_numel_each_device); @@ -2464,10 +2448,10 @@ void DistributedFusedLambKernel( local_comm_ctx->AllGather(&recv_buf, send_buf, stream); } else { PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllGather(fp16_param_data + fp16_offset, + phi::dynload::ncclAllGather(fp16_param_data + fp16_offset, fp16_param_data, fp16_numel_each_device, - mcclFloat16, + ncclFloat16, local_comm, stream)); } diff --git a/paddle/fluid/operators/optimizers/sparse_momentum_op.h b/paddle/fluid/operators/optimizers/sparse_momentum_op.h index 13d925bbe19a19..4c47fd2b621784 100644 --- a/paddle/fluid/operators/optimizers/sparse_momentum_op.h +++ b/paddle/fluid/operators/optimizers/sparse_momentum_op.h @@ -25,8 +25,7 @@ #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/common/amp_type_traits.h" -#if defined(__NVCC__) || defined(__MUSACC__) - +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ @@ -462,7 +461,7 @@ class SparseMomentumOpKernel : public framework::OpKernel { grad_index.mutable_data({num_index}, ctx.GetPlace()); if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) auto sort_value_ptr = sort_value.mutable_data({num_index}, ctx.GetPlace()); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op.cc b/paddle/fluid/operators/pscore/send_and_recv_op.cc index cc11601be0be61..4f118565396e11 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op.cc @@ -107,7 +107,7 @@ PD_REGISTER_STRUCT_KERNEL(send_and_recv, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL(send_and_recv, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index b9f05d663dba08..ebdddfd41b33f5 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -246,7 +246,7 @@ PD_REGISTER_STRUCT_KERNEL( PD_REGISTER_STRUCT_KERNEL( rank_loss_grad, CPU, ALL_LAYOUT, ops::RankLossGradKernel, float) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) PD_REGISTER_STRUCT_KERNEL( rank_loss, GPU, ALL_LAYOUT, ops::RankLossKernel, float) {} PD_REGISTER_STRUCT_KERNEL( diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 24457c24a54ace..b73ffe4319be78 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -48,7 +48,7 @@ BufferedReader::BufferedReader( buffer_size_(buffer_size), pin_memory_(pin_memory) { VLOG(1) << "BufferedReader"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place_) && !pin_memory) { int dev_idx = place_.device; // NOLINT compute_stream_ = @@ -118,7 +118,7 @@ void BufferedReader::ReadAsync(size_t i) { return -1UL; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) // @{ Group GPU Place +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // @{ Group GPU Place if (platform::is_gpu_place(place_)) { TensorVec &cuda = cuda_buffer_[i]; if (cuda.empty()) { @@ -197,11 +197,6 @@ void BufferedReader::ReadAsync(size_t i) { hipEventRecord(events_[i].get(), compute_stream_)); PADDLE_ENFORCE_GPU_SUCCESS( hipStreamWaitEvent(stream_.get(), events_[i].get(), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventRecord(events_[i].get(), compute_stream_)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamWaitEvent(stream_.get(), events_[i].get(), 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventRecord(events_[i].get(), compute_stream_)); diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index db849dc70b5da9..032a74b7e23f14 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -21,7 +21,7 @@ #include "ThreadPool.h" #include "paddle/fluid/framework/reader.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -80,7 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector xpu_buffer_; std::vector custom_device_buffer_; size_t prev_pos_{-1UL}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index d0bde6af204893..e69492501c1173 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -28,7 +28,7 @@ limitations under the License. */ #include "paddle/fluid/framework/phi_utils.h" #include "paddle/phi/kernels/cpu/reduce.h" -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) || defined(__MUSACC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) #include "paddle/phi/kernels/gpu/reduce.h" #include "paddle/phi/kernels/gpu/reduce_grad.h" #endif @@ -757,7 +757,7 @@ If reduce_all is true, just reduce along all dimensions and output a scalar. 
virtual std::string GetOpType() const = 0; }; -#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) || defined(__MUSACC__) +#if defined(__HIPCC__) || defined(__NVCC__) || defined(__xpu__) template class ReduceBaseOp, diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 1a26271a97f225..30d4fb0cf9ad4c 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -429,7 +429,7 @@ class ReshapeKernel { pt_scalar_shape, out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeInferKernel(static_cast(dev_ctx), @@ -462,7 +462,7 @@ class ReshapeGradKernel { phi::ReshapeGradKernel( static_cast(dev_ctx), *d_out, d_x); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeGradKernel( @@ -492,7 +492,7 @@ class ReshapeDoubleGradKernel { phi::ReshapeDoubleGradKernel( static_cast(dev_ctx), *d_out, *dd_x, dd_out); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto &dev_ctx = ctx.device_context(); phi::ReshapeDoubleGradKernel( @@ -764,7 +764,7 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::ReshapeDoubleGradOpNoNeedBufferVarInferer, Reshape2DoubleGradInferShapeFunctor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) REGISTER_OP_CUDA_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 14b86627c3825d..f025d278074215 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -121,7 +121,7 @@ PD_REGISTER_KERNEL(save_sr, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(save, GPU, ALL_LAYOUT, diff --git a/paddle/fluid/operators/select_op_helper.h b/paddle/fluid/operators/select_op_helper.h index 7e3de57345a4bc..2b7f884f6170c3 100644 --- a/paddle/fluid/operators/select_op_helper.h +++ b/paddle/fluid/operators/select_op_helper.h @@ -39,7 +39,7 @@ inline int GetBranchNumber(const phi::DenseTensor &mask) { } // when platform::is_gpu_place(mask.place()) is true std::unique_ptr cpu_mask{new phi::DenseTensor()}; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) framework::TensorCopySync(mask, platform::CPUPlace(), cpu_mask.get()); #else diff --git a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h index 13133e54f04152..2236988025cbc3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reverse_op.h @@ -136,7 +136,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { const size_t *lod; size_t lod_count = 
x.lod()[0].size(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(ctx.GetPlace())) { auto xlod = x.lod()[0]; phi::MixVector mixv_xlod(&xlod); @@ -144,7 +144,7 @@ class SequenceReverseOpKernel : public framework::OpKernel { } else { #endif lod = x.lod()[0].data(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 316f8a55cc8034..01f7bb3e928902 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -26,8 +26,44 @@ template class SequenceSoftmaxCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(false,"not support"); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto& lod = x->lod(); + auto& dims = x->dims(); + + const size_t level = lod.size() - 1; + PADDLE_ENFORCE_EQ( + dims[0], + static_cast(lod[level].back()), + platform::errors::InvalidArgument( + "The first dimension of Input(X) should be equal to the sum of all " + "sequences' lengths. But received first dimension of Input(X) is " + "%d, the sum of all sequences' lengths is %d.", + dims[0], + static_cast(lod[level].back()))); + PADDLE_ENFORCE_EQ(dims[0], + x->numel(), + platform::errors::InvalidArgument( + "The width of each timestep in Input(X) of " + "SequenceSoftmaxOp should be 1.")); + + out->mutable_data(ctx.GetPlace()); + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor x_i = x->Slice(start_pos, end_pos); + Tensor out_i = out->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = + // common::make_ddim({1UL, end_pos - start_pos, 1UL, 1UL}); + common::make_ddim({1UL, end_pos - start_pos}); + x_i.Resize(dims_i); + out_i.Resize(dims_i); + phi::funcs::SoftmaxCUDNNFunctor()( + ctx.template device_context(), &x_i, &out_i); + } } }; @@ -35,7 +71,36 @@ template class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(false,"not support"); + auto* out = ctx.Input("Out"); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* x_grad = ctx.Output(framework::GradVarName("X")); + if (x_grad) { + x_grad->set_lod(x->lod()); + } + auto& lod = x->lod(); + const size_t level = lod.size() - 1; + + x_grad->mutable_data(ctx.GetPlace()); // NOLINT + for (int i = 0; i < static_cast(lod[level].size()) - 1; ++i) { + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + + Tensor out_i = out->Slice(start_pos, end_pos); + Tensor out_grad_i = out_grad->Slice(start_pos, end_pos); + Tensor x_grad_i = x_grad->Slice(start_pos, end_pos); + + // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos) + framework::DDim dims_i = common::make_ddim({1UL, end_pos - start_pos}); + out_i.Resize(dims_i); + out_grad_i.Resize(dims_i); + x_grad_i.Resize(dims_i); + 
phi::funcs::SoftmaxGradCUDNNFunctor()( + ctx.template device_context(), + &out_i, + &out_grad_i, + &x_grad_i); + } } }; diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index a037d0dcf73ccf..12d4f72a91169e 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #endif diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu index 3262bef2bf5e93..40a7a451a6e21a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu @@ -17,10 +17,6 @@ limitations under the License. */ #include #endif -#ifdef __MUSACC__ -#include -#endif - #ifdef __HIPCC__ #include namespace cub = hipcub; diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 16864b80b5c765..a0aa1f589191ff 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -151,32 +151,26 @@ class SetValueGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { - if (this->HasInput("ValueTensor")) { - op->SetType("set_value_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("ValueTensor", this->Input("ValueTensor")); - if (this->HasInput("StartsTensorList")) { - op->SetInput("StartsTensorList", this->Input("StartsTensorList")); - } - if (this->HasInput("EndsTensorList")) { - op->SetInput("EndsTensorList", this->Input("EndsTensorList")); - } - if (this->HasInput("StepsTensorList")) { - op->SetInput("StepsTensorList", this->Input("StepsTensorList")); - } - - op->SetAttrMap(this->Attrs()); - - op->SetOutput(framework::GradVarName("ValueTensor"), - this->InputGrad("ValueTensor")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - - } else { - op->SetType("assign"); - op->SetInput("X", this->OutputGrad("Out")); - op->SetOutput("Out", this->InputGrad("Input")); + op->SetType("set_value_grad"); + op->SetInput("ValueTensor", this->Input("ValueTensor")); + op->SetOutput(framework::GradVarName("ValueTensor"), + this->InputGrad("ValueTensor")); + + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + + if (this->HasInput("StartsTensorList")) { + op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); } + if (this->HasInput("StepsTensorList")) { + op->SetInput("StepsTensorList", this->Input("StepsTensorList")); + } + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); } }; diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc index a1e4a328cf439f..6b79d5c35b7838 100644 --- a/paddle/fluid/operators/split_lod_tensor_op.cc +++ b/paddle/fluid/operators/split_lod_tensor_op.cc @@ -69,7 +69,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { if (platform::is_cpu_place(mask.place())) { cpu_mask->ShareDataWith(mask); } else if 
(platform::is_gpu_place(mask.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) framework::TensorCopy( mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index c2911806996ce5..caa31565d4cf3d 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -478,7 +478,7 @@ struct DeviceIndependenceTensorOperations { std::vector out_shape = GetBroadcastShape({&x, &y}); ret.Resize(common::make_ddim(out_shape)); if (platform::is_gpu_place(context.GetPlace())) { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) // For GPU, there is no need to define XxxInverseFunctor and call // ElementwiseComputeEx in two branches. ElementwiseComputeEx, DeviceContext, InT>( diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index 21406abff8d9f2..af69594f992cde 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/sync_batch_norm_utils.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/sync_batch_norm_kernel.h" @@ -105,8 +104,8 @@ void SyncBatchNormKernel(const Context& ctx, <<>>(x_d, N, H * W * D, C, stats); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - mcclComm_t comm = static_cast(detail::GetCCLComm(x.place(), 0)); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ncclComm_t comm = static_cast(detail::GetCCLComm(x.place(), 0)); if (comm == nullptr) { comm = ctx.nccl_comm(); } @@ -115,11 +114,11 @@ void SyncBatchNormKernel(const Context& ctx, int dtype = phi::ToNCCLDataType(mean_out->dtype()); // In-place operation PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(stats, + phi::dynload::ncclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - mcclSum, + static_cast(dtype), + ncclSum, comm, stream)); VLOG(3) << "Sync result using all reduce"; @@ -237,28 +236,26 @@ void SyncBatchNormCooKernel(const Context& dev_ctx, DenseTensor* saved_mean, DenseTensor* saved_variance, DenseTensor* reserve_space) { - PADDLE_ENFORCE(false, "error"); - - // EmptyLikeCooKernel(dev_ctx, x, y); - // phi::SyncBatchNormKernel(dev_ctx, - // x.values(), - // mean, - // variance, - // scale, - // bias, - // is_test, - // momentum, - // epsilon, - // data_layout, - // use_global_stats, - // trainable_statistics, - // y->mutable_values(), - // mean_out, - // variance_out, - // saved_mean, - // saved_variance, - // reserve_space); - // y->SetIndicesDict(x.GetIndicesDict()); + EmptyLikeCooKernel(dev_ctx, x, y); + phi::SyncBatchNormKernel(dev_ctx, + x.values(), + mean, + variance, + scale, + bias, + is_test, + momentum, + epsilon, + data_layout, + use_global_stats, + trainable_statistics, + y->mutable_values(), + mean_out, + variance_out, + saved_mean, + saved_variance, + reserve_space); + y->SetIndicesDict(x.GetIndicesDict()); } template @@ -280,27 +277,26 @@ void SyncBatchNormCooGradKernel( SparseCooTensor* x_grad, DenseTensor* scale_grad, DenseTensor* bias_grad) { - PADDLE_ENFORCE(false, "error"); - // EmptyLikeCooKernel(dev_ctx, x, x_grad); - // *scale_grad = 
phi::EmptyLike(dev_ctx, scale); - // *bias_grad = phi::EmptyLike(dev_ctx, bias); - // phi::SyncBatchNormGradKernel(dev_ctx, - // x.values(), - // scale, - // bias, - // saved_mean, - // saved_variance, - // reserve_space, - // y_grad.values(), - // momentum, - // epsilon, - // data_layout, - // is_test, - // use_global_stats, - // trainable_statistics, - // x_grad->mutable_values(), - // scale_grad, - // bias_grad); + EmptyLikeCooKernel(dev_ctx, x, x_grad); + *scale_grad = phi::EmptyLike(dev_ctx, scale); + *bias_grad = phi::EmptyLike(dev_ctx, bias); + phi::SyncBatchNormGradKernel(dev_ctx, + x.values(), + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + y_grad.values(), + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + x_grad->mutable_values(), + scale_grad, + bias_grad); } } // namespace sparse diff --git a/paddle/fluid/operators/sync_batch_norm_utils.h b/paddle/fluid/operators/sync_batch_norm_utils.h index 21f1052e03a289..c132a91bb5346c 100644 --- a/paddle/fluid/operators/sync_batch_norm_utils.h +++ b/paddle/fluid/operators/sync_batch_norm_utils.h @@ -19,8 +19,7 @@ limitations under the License. */ #include #include #include -#if defined(__NVCC__) || defined(__MUSACC__) - +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ @@ -28,7 +27,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/distributed/collective/process_group.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/distributed/collective/process_group_nccl.h" #endif #include "paddle/common/layout.h" @@ -571,9 +570,9 @@ void SyncBatchNormGradFunctor( } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int global_gid = 0; - mcclComm_t comm = nullptr; + ncclComm_t comm = nullptr; if (paddle::distributed::ProcessGroupMapFromGid::getInstance()->has( global_gid)) { @@ -589,11 +588,11 @@ void SyncBatchNormGradFunctor( int dtype = paddle::platform::ToNCCLDataType(scale.dtype()); // In-place operation PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mcclAllReduce(stats, + phi::dynload::ncclAllReduce(stats, stats, 2 * C + 1, - static_cast(dtype), - mcclSum, + static_cast(dtype), + ncclSum, comm, stream)); VLOG(3) << "Sync result using all reduce"; diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 63d8614f3c3697..ef6172b6965f22 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -15,8 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include -#if defined(__NVCC__) || defined(__MUSACC__) - +#ifdef __NVCC__ #include "cub/cub.cuh" #endif #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 20fe009e4c0912..458794223dc743 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include "paddle/phi/core/generator.h" @@ -113,7 +113,7 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template struct UniformGenerator { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1aaafb99cf9696..113ba40ec0cf31 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -64,7 +64,7 @@ if(WITH_DGC) set(dgc_deps dgc) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader) endif() @@ -90,14 +90,8 @@ if(WITH_ROCM) SRCS stream_callback_manager.cc DEPS simple_threadpool enforce common) endif() -if(WITH_MUSA) - musa_library( - stream_callback_manager - SRCS stream_callback_manager.cc - DEPS simple_threadpool enforce common) -endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(STREAM_CALLBACK_DEPS stream_callback_manager) else() set(STREAM_CALLBACK_DEPS) @@ -144,7 +138,7 @@ cc_library( SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce common) -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) target_link_libraries(device_context gpu_resource_pool) endif() @@ -242,31 +236,6 @@ if(WITH_ROCM) DEPS device_context gpu_info) endif() -if(WITH_MUSA) - musa_library( - device_event_gpu - SRCS device_event_gpu.cc - DEPS device_event_base) - set(DEVICE_EVENT_LIBS - device_event_gpu - CACHE INTERNAL "device event libs") - if(WITH_CUSTOM_DEVICE) - musa_test( - device_event_test - SRCS device_event_test.cc - DEPS device_event_gpu device_event_custom_device) - else() - musa_test( - device_event_test - SRCS device_event_test.cc - DEPS device_event_gpu) - endif() - musa_test( - device_context_test - SRCS device_context_test.cu - DEPS device_context gpu_info) -endif() - cc_library(timer SRCS timer.cc) cc_test( timer_test @@ -316,18 +285,6 @@ elseif(WITH_ROCM) stats op_proto_maker shape_inference) -elseif(WITH_MUSA) - musa_library( - profiler - SRCS profiler.cc profiler.cu - DEPS phi - common - gpu_info - enforce - new_profiler - stats - op_proto_maker - shape_inference) elseif(WITH_XPU) cc_library( profiler @@ -408,23 +365,8 @@ if(WITH_ROCM) DEPS gpu_info) endif() -if(WITH_MUSA) - musa_test( - float16_gpu_test - SRCS float16_test.cu - DEPS lod_tensor) - musa_test( - test_limit_gpu_memory - SRCS test_limit_gpu_memory.cu - DEPS gpu_info phi common) - musa_library( - cuda_device_guard - SRCS cuda_device_guard.cc - DEPS gpu_info) -endif() - if(NOT APPLE AND NOT WIN32) - if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) + if(WITH_GPU OR WITH_ROCM) cc_test( device_code_test SRCS device_code_test.cc diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index 0c322075018983..4ffcf53b1a5747 100644 --- 
a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -23,7 +23,7 @@ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLCommImpl : public NCCLComm { public: void set_ring_id(int ring_id) { ring_id_ = ring_id; } @@ -37,8 +37,8 @@ class NCCLCommImpl : public NCCLComm { int device_id() const override { return dev_ctx_->GetPlace().device; } - void set_comm(mcclComm_t comm) { comm_ = comm; } - mcclComm_t comm() const override { return comm_; } + void set_comm(ncclComm_t comm) { comm_ = comm; } + ncclComm_t comm() const override { return comm_; } gpuStream_t stream() const override { return dev_ctx_->stream(); } @@ -64,7 +64,7 @@ class NCCLCommImpl : public NCCLComm { int ring_id_; int nranks_; int rank_; - mcclComm_t comm_; + ncclComm_t comm_; std::unique_ptr dev_ctx_; // used for comm wait compute, compute_stream-->event-->comm_stream @@ -80,7 +80,7 @@ NCCLCommContext& NCCLCommContext::Instance() { } NCCLComm* NCCLCommContext::CreateComm( - mcclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id) { + ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id) { PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument( "The nccl unique id should not be null.")); @@ -106,10 +106,10 @@ NCCLComm* NCCLCommContext::CreateComm( platform::errors::InvalidArgument( "Expected dev_id >= 0. But received dev_id is %d.", dev_id)); - mcclComm_t comm = nullptr; + ncclComm_t comm = nullptr; SetDeviceId(dev_id); PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::mcclCommInitRank(&comm, nranks, *nccl_id, rank)); + platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); auto* comm_wrapper = AssignNCCLComm(comm, nranks, rank, dev_id, ring_id); @@ -133,8 +133,8 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, dev_ids.size())); const int kDevices = dev_ids.size(); - mcclComm_t comms[kDevices]; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclCommInitAll( + ncclComm_t comms[kDevices]; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclCommInitAll( comms, dev_ids.size(), dev_ids.data())); PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), @@ -156,7 +156,7 @@ void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, void NCCLCommContext::CreateNCCLCommMultiTrainer( const std::vector& dev_ids, - mcclUniqueId* nccl_id, + ncclUniqueId* nccl_id, int ntrainers, int train_id, int ring_id) { @@ -169,22 +169,20 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( VLOG(1) << "Begin CreateNCCLCommMultiTrainer. 
device number: " << kDevices << ", ntrainers: " << ntrainers << ", train_id: " << train_id << ", rind_id: " << ring_id; - mcclComm_t comms[kDevices]; + ncclComm_t comms[kDevices]; { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); for (int i = 0; i < kDevices; i++) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipSetDevice(i)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaSetDevice(i)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(i)); #endif - platform::dynload::mcclCommInitRank( + platform::dynload::ncclCommInitRank( comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); VLOG(1) << "ncclCommInitRank: " << i; } - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), @@ -210,7 +208,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( } NCCLComm* NCCLCommContext::AssignNCCLComm( - mcclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { + ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id) { std::unique_ptr dev_ctx( new phi::GPUContext(CUDAPlace(dev_id))); dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index d88e6e69fba50b..6636856a0eb6ce 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -28,10 +28,10 @@ namespace paddle { namespace platform { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // In order to apply hierarchical communication with NCCL, we need // a communication ring contains NCCL communicators associated to a global -// mcclUniqueId. E.g. for a hierarchical case, +// ncclUniqueId. E.g. 
for a hierarchical case, // // 11 - 12 21 - 22 // | | | | @@ -55,7 +55,7 @@ class NCCLComm { virtual int nranks() const = 0; virtual int rank() const = 0; virtual int device_id() const = 0; - virtual mcclComm_t comm() const = 0; + virtual ncclComm_t comm() const = 0; virtual gpuStream_t stream() const = 0; virtual gpuEvent_t compute_event() const = 0; virtual gpuEvent_t comm_event() const = 0; @@ -69,12 +69,12 @@ class NCCLCommContext { static NCCLCommContext& Instance(); NCCLComm* CreateComm( - mcclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id = 0); + ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id = 0); void CreateAllNCCLComms(const std::vector& dev_ids, int ring_id = 0); void CreateNCCLCommMultiTrainer(const std::vector& dev_ids, - mcclUniqueId* nccl_id, + ncclUniqueId* nccl_id, int nranks, int rank, int ring_id); @@ -82,7 +82,7 @@ class NCCLCommContext { // a latter comm with the same dev_id and the same ring_id // will override the former NCCLComm* AssignNCCLComm( - mcclComm_t comm, int nranks, int rank, int dev_id, int ring_id = 0); + ncclComm_t comm, int nranks, int rank, int dev_id, int ring_id = 0); // retrieve a communicator by the ring id in multiprocessing mode NCCLComm* Get(int ring_id) const { @@ -99,7 +99,7 @@ class NCCLCommContext { return comm_map_.at(ring_id).begin()->second.get(); } - int GetRingId(mcclComm_t comm) const { + int GetRingId(ncclComm_t comm) const { for (const auto& pair : comm_map_) { for (const auto& p : pair.second) { if (p.second.get()->comm() == comm) { diff --git a/paddle/fluid/platform/device/CMakeLists.txt b/paddle/fluid/platform/device/CMakeLists.txt index b782a45047117b..6f0d86f0a4b176 100644 --- a/paddle/fluid/platform/device/CMakeLists.txt +++ b/paddle/fluid/platform/device/CMakeLists.txt @@ -1,7 +1,7 @@ set(DEV_LIBS custom_device) # GPU -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) add_subdirectory(gpu) endif() diff --git a/paddle/fluid/platform/device/device_wrapper.h b/paddle/fluid/platform/device/device_wrapper.h index bcfb316837a302..aa2dba03c90824 100644 --- a/paddle/fluid/platform/device/device_wrapper.h +++ b/paddle/fluid/platform/device/device_wrapper.h @@ -16,7 +16,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 3176d042b7146d..65c3fb20631675 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -28,18 +28,6 @@ elseif(WITH_ROCM) cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) -elseif(WITH_MUSA) - # add_subdirectory(musa) - musa_library( - gpu_info - SRCS gpu_info.cc - DEPS phi common glog enforce monitor dynload_cuda) - - musa_test(cuda_helper_test SRCS cuda_helper_test.cu) - musa_test( - cudnn_desc_test - SRCS cudnn_desc_test.cc - DEPS dynload_cuda) endif() cc_library( diff --git a/paddle/fluid/platform/device/gpu/gpu_helper.h b/paddle/fluid/platform/device/gpu/gpu_helper.h index f94f5d55b7eeef..878a122a492243 100644 --- a/paddle/fluid/platform/device/gpu/gpu_helper.h +++ b/paddle/fluid/platform/device/gpu/gpu_helper.h @@ -13,12 +13,10 @@ // limitations under the License. 
#pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/rocm/rocm_helper.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/fluid/platform/device/gpu/musa/musa_helper.h" #else #include "paddle/fluid/platform/device/gpu/cuda/cuda_helper.h" #include "paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h" diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index f82d836e83e770..3a26b73e64b772 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -35,8 +35,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/fluid/platform/dynload/mudnn.h" #else #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/phi/backends/gpu/cuda/cuda_graph.h" @@ -217,12 +215,6 @@ class RecordedGpuMallocHelper { } else { result = hipMalloc(ptr, size); } -#elif defined(PADDLE_WITH_MUSA) - if (UNLIKELY(malloc_managed_memory)) { - result = musaMallocManaged(ptr, size); - } else { - result = musaMalloc(ptr, size); - } #else phi::backends::gpu::CUDAGraphCaptureModeGuard capture_mode_guard; if (UNLIKELY(malloc_managed_memory)) { @@ -268,9 +260,6 @@ class RecordedGpuMallocHelper { #ifdef PADDLE_WITH_HIP auto err = hipFree(ptr); if (err != hipErrorDeinitialized) { -#elif defined(PADDLE_WITH_MUSA) - auto err = musaFree(ptr); - if (err != musaErrorMusartUnloading) { #else auto err = cudaFree(ptr); VLOG(10) << "[cudaFree] size=" << static_cast(size) / (1 << 20) @@ -317,8 +306,6 @@ class RecordedGpuMallocHelper { CUDADeviceGuard guard(dev_id_); #ifdef PADDLE_WITH_HIP auto result = hipMemGetInfo(actual_avail, actual_total); -#elif defined(PADDLE_WITH_MUSA) - auto result = musaMemGetInfo(actual_avail, actual_total); #else auto result = cudaMemGetInfo(actual_avail, actual_total); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index a2fe54ae4dca4f..b5a00e9257a80e 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include diff --git a/paddle/fluid/platform/device/gpu/gpu_launch_config.h b/paddle/fluid/platform/device/gpu/gpu_launch_config.h index 018fee5f7416f8..98c6e379342f25 100644 --- a/paddle/fluid/platform/device/gpu/gpu_launch_config.h +++ b/paddle/fluid/platform/device/gpu/gpu_launch_config.h @@ -16,12 +16,10 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include -#elif defined(PADDLE_WITH_MUSA) -#include #else #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc index 0fb7e061e3243c..9f2168e1cdb8b0 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -30,9 +30,6 @@ CudaStreamResourcePool::CudaStreamResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamCreateWithFlags(&stream, musaStreamNonBlocking)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); @@ -44,8 +41,6 @@ CudaStreamResourcePool::CudaStreamResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -87,9 +82,6 @@ CudaEventResourcePool::CudaEventResourcePool() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event, hipEventDisableTiming)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaEventCreateWithFlags(&event, musaEventDisableTiming)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreateWithFlags(&event, cudaEventDisableTiming)); @@ -101,8 +93,6 @@ CudaEventResourcePool::CudaEventResourcePool() { platform::SetDeviceId(dev_idx); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventDestroy(event)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h index 17e649b9ac62a8..2ac13e692f7837 100644 --- a/paddle/fluid/platform/device/gpu/gpu_resource_pool.h +++ b/paddle/fluid/platform/device/gpu/gpu_resource_pool.h @@ -14,16 +14,13 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif + #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/device/gpu/gpu_types.h b/paddle/fluid/platform/device/gpu/gpu_types.h index df8b87ed3a0365..c9afafdef7166c 100644 --- a/paddle/fluid/platform/device/gpu/gpu_types.h +++ b/paddle/fluid/platform/device/gpu/gpu_types.h @@ -15,19 +15,14 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include #include "paddle/fluid/platform/dynload/miopen.h" #include "paddle/fluid/platform/dynload/rocblas.h" -#elif defined(PADDLE_WITH_MUSA) -#include -#include "paddle/fluid/platform/dynload/mublas.h" -#include "paddle/fluid/platform/dynload/mudnn.h" -#include "paddle/phi/backends/gpu/forwards.h" #else #include @@ -39,95 +34,78 @@ namespace paddle { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = MUSA_TYPE; #else // CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + 
+#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t, musaStream_t); -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t, musaEvent_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind, musaMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t, musaDeviceProp); - - - // DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t, mudnnDataType_t); - // DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, - // cudnnActivationStruct, - // miopenActivationDescriptor, - // mudnnActivationStruct); - // DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, - // cudnnActivationMode_t, - // miopenActivationMode_t, - // mudnnActivationMode_t); - // DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, - // cudnnTensorStruct, - // miopenTensorDescriptor, - // mudnnTensorStruct); - // DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, - // cudnnTensorFormat_t, - // miopenTensorFormat_t, - // mudnnTensorFormat_t); - // DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, - // cudnnFilterStruct, - // miopenTensorDescriptor, - // mudnnFilterStruct); - // DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, - // cudnnFilterDescriptor_t, - // miopenTensorDescriptor_t, - // mudnnFilterDescriptor_t); - // DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, - // cudnnConvolutionStruct, - // miopenConvolutionDescriptor, - // mudnnConvolutionStruct); - // DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, - // cudnnConvolutionDescriptor_t, - // miopenConvolutionDescriptor_t, - // mudnnConvolutionDescriptor_t); - // DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, - // cudnnPoolingDescriptor_t, - // miopenPoolingDescriptor_t, - // mudnnPoolingDescriptor_t); - // DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t,mudnnPoolingMode_t);MUDNN_DNN_ROUTINE_EACH - // DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, - // cudnnDropoutDescriptor_t, - // miopenDropoutDescriptor_t, - // mudnnDropoutDescriptor_t); - DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t,mudnnHandle_t); - -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle,mublasHandle_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); + +DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, + cudnnActivationMode_t, + miopenActivationMode_t); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, + cudnnTensorFormat_t, + miopenTensorFormat_t); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + cudnnPoolingDescriptor_t, + 
miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); + +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. -// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); using CUDAGraphID = unsigned long long; // NOLINT #undef DECLARE_TYPE_FOR_GPU #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ - constexpr auto GPU_CV = MUSA_CV; #else // CDUA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory, - musaErrorMemoryAllocation); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady, musaErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); #undef DECLARE_CONSTANT_FOR_GPU } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/musa/musa_helper.h b/paddle/fluid/platform/device/gpu/musa/musa_helper.h deleted file mode 100644 index 45ded21129a5ad..00000000000000 --- a/paddle/fluid/platform/device/gpu/musa/musa_helper.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // NOLINT - -#include "paddle/fluid/platform/dynload/mublas.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace platform { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why need this macro? ] - * - * The original looping in CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB, the first iteration is no problem here, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * will greater than INT_MAX and overflow becomes negative value, at - * this time, the cycle condition `i < (n)` is still satisfied, so it - * will cause illegal access to cuda memory. 
- * - * Here is a real example in ERINE, it will trigger above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follow, the int64_t __index__ will - * prevent overflow in the loop increment. - * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -class CublasHandleHolder { - public: - explicit CublasHandleHolder(musaStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasCreate(&handle_)); - PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasSetStream(handle_, stream)); - } - - const mublasHandle_t& GetCublasHandle() const { return handle_; } - - ~CublasHandleHolder() PADDLE_MAY_THROW { - PADDLE_RETRY_CUDA_SUCCESS(dynload::mublasDestroy(handle_)); - } - - template - inline void Call(Callback&& callback) const { - std::lock_guard guard(mtx_); - callback(handle_); - } - - private: - DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); - - mublasHandle_t handle_; - mutable std::mutex mtx_; -}; - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index db5bcbc08c5de6..8afcfc9f2b7005 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include #include @@ -29,15 +29,9 @@ #ifdef PADDLE_WITH_NCCL #include "paddle/fluid/platform/dynload/nccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif #ifdef PADDLE_WITH_RCCL #include "paddle/fluid/platform/dynload/rccl.h" #endif -#ifdef PADDLE_WITH_MCCL -#include "paddle/fluid/platform/dynload/mccl.h" -#endif #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/bfloat16.h" @@ -50,63 +44,63 @@ namespace paddle { namespace platform { -inline mcclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { +inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { if (type == framework::proto::VarType::FP32) { - return mcclFloat; + return ncclFloat; } else if (type == framework::proto::VarType::FP64) { - return mcclFloat; + return ncclDouble; } else if (type == framework::proto::VarType::INT32) { - return mcclInt; + return ncclInt; } else if (type == framework::proto::VarType::INT64) { - return mcclInt64; + return ncclInt64; } else if (type == framework::proto::VarType::FP16) { - return mcclFloat16; + return ncclFloat16; } else if (type == framework::proto::VarType::INT8) { - return mcclInt8; + return ncclInt8; } else if (type == framework::proto::VarType::UINT8) { - return mcclUint8; + return ncclUint8; } else if 
(type == framework::proto::VarType::BOOL) { - return mcclUint8; -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - // } else if (type == framework::proto::VarType::BF16) { - // return mcclBfloat16; -// #endif + return ncclUint8; +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + } else if (type == framework::proto::VarType::BF16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } } -inline mcclDataType_t ToNCCLDataType(phi::DataType type) { +inline ncclDataType_t ToNCCLDataType(phi::DataType type) { if (type == phi::DataType::FLOAT32) { - return mcclFloat; + return ncclFloat; } else if (type == phi::DataType::FLOAT64) { - return mcclFloat; + return ncclDouble; } else if (type == phi::DataType::INT32) { - return mcclInt; + return ncclInt; } else if (type == phi::DataType::INT64) { - return mcclInt64; + return ncclInt64; } else if (type == phi::DataType::FLOAT16) { - return mcclFloat16; + return ncclFloat16; } else if (type == phi::DataType::UINT8) { - return mcclUint8; + return ncclUint8; } else if (type == phi::DataType::INT8) { - return mcclInt8; + return ncclInt8; } else if (type == phi::DataType::BOOL) { - return mcclUint8; -// #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 - // } else if (type == phi::DataType::BFLOAT16) { - // return mcclBfloat16; -// #endif + return ncclUint8; +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + } else if (type == phi::DataType::BFLOAT16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); } } -// NOTE(minqiyang): according to the mcclGroupEnd documentations: +// NOTE(minqiyang): according to the ncclGroupEnd documentations: // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, -// mcclGroupEnd will wait for all communicators to be initialized, which will +// ncclGroupEnd will wait for all communicators to be initialized, which will // cause blocking problem when a runtime_error was thrown, so try only guard // NCCL actions when use it. 
class NCCLGroupGuard { @@ -118,18 +112,18 @@ class NCCLGroupGuard { inline NCCLGroupGuard() { NCCLMutex().lock(); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupStart()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupStart()); } inline ~NCCLGroupGuard() PADDLE_MAY_THROW { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::mcclGroupEnd()); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; struct NCCLContext { std::unique_ptr ctx_; - mcclComm_t comm_; + ncclComm_t comm_; explicit NCCLContext(int dev_id) : comm_{nullptr} { ctx_.reset(new phi::GPUContext(CUDAPlace(dev_id))); @@ -156,7 +150,7 @@ struct NCCLContext { } gpuStream_t stream() const { return ctx_->stream(); } - mcclComm_t comm() const { return comm_; } + ncclComm_t comm() const { return comm_; } int device_id() const { return ctx_->GetPlace().device; } }; @@ -166,7 +160,7 @@ struct NCCLContextMap { std::vector order_; explicit NCCLContextMap(const std::vector &places, - mcclUniqueId *nccl_id = nullptr, + ncclUniqueId *nccl_id = nullptr, size_t num_trainers = 1, size_t trainer_id = 0) { PADDLE_ENFORCE_EQ(!places.empty(), @@ -185,11 +179,11 @@ struct NCCLContextMap { platform::errors::Unavailable("NCCL Context Map does not support " "contain two or more same device.")); - std::unique_ptr comms(new mcclComm_t[order_.size()]); + std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); - PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::mcclCommInitAll( + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); } else { PADDLE_ENFORCE_NOT_NULL( @@ -209,7 +203,7 @@ struct NCCLContextMap { VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; SetDeviceId(gpu_id); - PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::mcclCommInitRank( + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); } } @@ -304,7 +298,7 @@ class NCCLCommunicator { } void InitFlatCtxs(const std::vector &places, - const std::vector &nccl_ids, + const std::vector &nccl_ids, size_t trainers_num, size_t trainer_id) { if (nccl_ids.size() == 0) { @@ -336,8 +330,8 @@ class NCCLCommunicator { } void InitHierarchicalCtxs(const std::vector &places, - const std::vector &inter_nccl_ids, - const std::vector &exter_nccl_ids, + const std::vector &inter_nccl_ids, + const std::vector &exter_nccl_ids, size_t trainers_num, size_t trainer_id, size_t inter_trainers_num, diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 786b38239e60ef..c4f40767fd52ce 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -27,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/expect.h" #include "paddle/phi/core/generator.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -53,7 +53,7 @@ DeviceType Place2DeviceType(const platform::Place& place) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template typename std::enable_if::value, DevCtx*>::type @@ -86,7 +86,7 @@ inline std::unique_ptr CreateDeviceContext( DevCtx* dev_ctx = ConstructDevCtx(p, stream_priority); auto& instance = paddle::memory::allocation::AllocatorFacade::Instance(); if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* cuda_ctx = dynamic_cast(dev_ctx); PADDLE_ENFORCE_NOT_NULL( cuda_ctx, @@ -184,7 +184,7 @@ void EmplaceDeviceContexts( /*unused*/ stream_priority); #endif } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( place_to_device_context, place, @@ -221,7 +221,7 @@ void EmplaceDeviceContexts( "option.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) EmplaceDeviceContext( place_to_device_context, place, diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index b015bb9a3e6259..4a75d3ea97f9ae 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -53,18 +53,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT #endif - -#ifdef PADDLE_WITH_MUSA -#include "paddle/fluid/platform/device/gpu/gpu_helper.h" // NOLINT -#include "paddle/fluid/platform/dynload/mudnn.h" -#include "paddle/fluid/platform/dynload/mublas.h" -#include "paddle/phi/backends/gpu/gpu_context.h" // NOLINT -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include "paddle/fluid/platform/dynload/mccl.h" -#endif -#include "paddle/fluid/platform/device/gpu/gpu_info.h" // NOLINT -#endif - #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" #endif @@ -148,7 +136,7 @@ namespace xpu = baidu::xpu::api; using XPUDeviceContext = phi::XPUContext; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using CUDAPinnedDeviceContext = phi::GPUPinnedContext; #endif @@ -177,7 +165,7 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> struct DefaultDeviceContextType { using TYPE = paddle::platform::CUDAPinnedDeviceContext; diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h index cb43f00f7fe0fb..402974b89e5c90 100644 --- a/paddle/fluid/platform/device_event.h +++ b/paddle/fluid/platform/device_event.h @@ -31,7 +31,7 @@ using ::paddle::platform::kXPU; USE_EVENT(kCPU) USE_EVENT_WAIT(kCPU, kCPU) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) USE_EVENT(kCUDA); USE_EVENT_WAIT(kCUDA, kCUDA) USE_EVENT_WAIT(kCPU, kCUDA) diff --git a/paddle/fluid/platform/device_event_base.cc b/paddle/fluid/platform/device_event_base.cc index c23f395e0e36bb..cd2d31f1fbefb7 100644 --- a/paddle/fluid/platform/device_event_base.cc +++ b/paddle/fluid/platform/device_event_base.cc @@ -53,14 +53,6 @@ unsigned int GenerateDeviceEventFlag(bool enable_timing, return flags; #endif -#ifdef PADDLE_WITH_MUSA - unsigned int flags = - (blocking ? musaEventBlockingSync : musaEventDefault) | - (enable_timing ? musaEventDefault : musaEventDisableTiming) | - (interprocess ? musaEventInterprocess : musaEventDefault); - return flags; -#endif - return 0; } diff --git a/paddle/fluid/platform/device_event_gpu.cc b/paddle/fluid/platform/device_event_gpu.cc index bbeb67821e023d..d64b062cda0acc 100644 --- a/paddle/fluid/platform/device_event_gpu.cc +++ b/paddle/fluid/platform/device_event_gpu.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/platform/device_event_base.h" #include "paddle/fluid/platform/event.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) namespace paddle { namespace platform { struct CUDADeviceEventWrapper { diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 10f582069e6613..29f7b91a171572 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -22,10 +22,6 @@ endif() if(WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc) endif() -if(WITH_MUSA) - list(APPEND MUSA_SRCS mublas.cc mudnn.cc murand.cc mufft.cc) -endif() - # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS, and only do a early test on Linux and Windows. 
@@ -43,15 +39,6 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() - if(WITH_MUSA) - list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) - if(WITH_MCCL) - list(APPEND MUSA_SRCS mccl.cc) - endif() - if(CUPTI_FOUND) - list(APPEND MUSA_SRCS mupti.cc) - endif() - endif() endif() if(TENSORRT_FOUND) @@ -75,15 +62,6 @@ if(WITH_ROCM) dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi common) -elseif(WITH_MUSA) - musa_library( - dynload_cuda - SRCS ${MUSA_SRCS} - DEPS dynamic_loader phi common) - cc_library( - dynload_warpctc - SRCS warpctc.cc - DEPS dynamic_loader warpctc phi common) else() nv_library( dynload_cuda diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index aebdd715b9e1cc..93a19645a0a34e 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -47,7 +47,6 @@ void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); void* GetXPTIDsoHandle(); -void* GetMUFFTDsoHandle(); void SetPaddleLibPath(const std::string&); } // namespace dynload diff --git a/paddle/fluid/platform/dynload/mccl.cc b/paddle/fluid/platform/dynload/mccl.cc deleted file mode 100644 index 8497d35e2484d2..00000000000000 --- a/paddle/fluid/platform/dynload/mccl.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mccl.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); - -// #if NCCL_VERSION_CODE >= 2212 -MCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -// #endif - -// #if NCCL_VERSION_CODE >= 2304 -MCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -// #endif - -// #if NCCL_VERSION_CODE >= 2703 -MCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -// #endif - -// #if NCCL_VERSION_CODE >= 21100 -MCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -// #endif - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mccl.h b/paddle/fluid/platform/dynload/mccl.h deleted file mode 100644 index 0e1eac41691a58..00000000000000 --- a/paddle/fluid/platform/dynload/mccl.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/mccl.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -MCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(mcclBroadcast); -MCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); -MCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ - __macro(mcclSend); \ - __macro(mcclRecv); -MCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - - -#define MCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ - __macro(mcclRedOpCreatePreMulSum); \ - __macro(mcclRedOpDestroy); -MCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.cc b/paddle/fluid/platform/dynload/mublas.cc deleted file mode 100644 index 0ca4c6c3dac999..00000000000000 --- a/paddle/fluid/platform/dynload/mublas.cc +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mublas.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R2 -MUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R3 -MUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R4 -MUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); -#endif -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mublas.h b/paddle/fluid/platform/dynload/mublas.h deleted file mode 100644 index 0b7d21a4ecb76f..00000000000000 --- a/paddle/fluid/platform/dynload/mublas.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include // NOLINT -#include - -#include "paddle/phi/backends/dynload/mublas.h" - -namespace paddle { -namespace platform { -namespace dynload { - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load mublas routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - - -MUBLAS_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - - -MUBLAS_BLAS_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - - -MUBLAS_BLAS_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - - -MUBLAS_BLAS_ROUTINE_EACH_R4(PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mudnn.cc b/paddle/fluid/platform/dynload/mudnn.cc deleted file mode 100644 index 8b6ee172e14556..00000000000000 --- a/paddle/fluid/platform/dynload/mudnn.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mudnn.h" - -#include "paddle/phi/backends/dynload/mudnn.h" -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -namespace paddle { -namespace platform { -namespace dynload { - -// MUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); - -bool HasCUDNN() { return phi::dynload::HasCUDNN(); } - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mudnn.h b/paddle/fluid/platform/dynload/mudnn.h deleted file mode 100644 index f980972538a0e4..00000000000000 --- a/paddle/fluid/platform/dynload/mudnn.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include -#include "paddle/phi/backends/dynload/mudnn.h" - -namespace paddle { -namespace platform { -namespace dynload { - -using ::musa::dnn::BatchNorm; -using ::musa::dnn::Convolution; -using ::musa::dnn::Handle; -using ::musa::dnn::MemoryHandler; -using ::musa::dnn::Pooling; -using ::musa::dnn::Softmax; -using ::musa::dnn::Tensor; - -extern bool HasCUDNN(); - -} // namespace dynload -} // namespace platform -} // namespace paddle - -#endif diff --git a/paddle/fluid/platform/dynload/mufft.cc b/paddle/fluid/platform/dynload/mufft.cc deleted file mode 100644 index 1126ab516619c7..00000000000000 --- a/paddle/fluid/platform/dynload/mufft.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/mufft.h" - -#include "paddle/phi/backends/dynload/mufft.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); - - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/mufft.h b/paddle/fluid/platform/dynload/mufft.h deleted file mode 100644 index 31452acd9d817f..00000000000000 --- a/paddle/fluid/platform/dynload/mufft.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/mufft.h" - -namespace paddle { -namespace platform { -namespace dynload { - - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUFFT_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -/** - * include all needed cufft functions in HPPL - * different cufft version has different interfaces - **/ -#define MUFFT_FFT_ROUTINE_EACH(__macro) \ - __macro(mufftPlan1d); \ - __macro(mufftPlan2d); \ - __macro(mufftPlan3d); \ - __macro(mufftPlanMany); \ - __macro(mufftMakePlan1d); \ - __macro(mufftMakePlan2d); \ - __macro(mufftMakePlan3d); \ - __macro(mufftMakePlanMany); \ - __macro(mufftEstimate1d); \ - __macro(mufftEstimate2d); \ - __macro(mufftEstimate3d); \ - __macro(mufftEstimateMany); \ - __macro(mufftCreate); \ - __macro(mufftGetSize1d); \ - __macro(mufftGetSize2d); \ - __macro(mufftGetSize3d); \ - __macro(mufftGetSizeMany); \ - __macro(mufftGetSize); \ - __macro(mufftSetWorkArea); \ - __macro(mufftSetAutoAllocation); \ - __macro(mufftExecC2C); \ - __macro(mufftExecR2C); \ - __macro(mufftExecC2R); \ - __macro(mufftExecZ2Z); \ - __macro(mufftExecD2Z); \ - __macro(mufftExecZ2D); \ - __macro(mufftSetStream); \ - __macro(mufftDestroy); \ - __macro(mufftGetVersion); \ - __macro(mufftGetProperty); \ - __macro(mufftXtSetGPUs); \ - __macro(mufftXtMalloc); \ - __macro(mufftXtMemcpy); \ - __macro(mufftXtFree); \ - __macro(mufftXtExecDescriptorC2C); \ - __macro(mufftXtExecDescriptorR2C); \ - __macro(mufftXtExecDescriptorC2R); \ - __macro(mufftXtExecDescriptorZ2Z); \ - __macro(mufftXtExecDescriptorD2Z); \ - __macro(mufftXtExecDescriptorZ2D); \ - __macro(mufftXtQueryPlan); \ - __macro(mufftXtSetCallback); \ - __macro(mufftXtClearCallback); \ - __macro(mufftXtMakePlanMany); \ - __macro(mufftXtGetSizeMany); \ - __macro(mufftXtExec); \ - __macro(mufftXtExecDescriptor); - -MUFFT_FFT_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUFFT_WRAP) - -} // namespace dynload -} // namespace platform -} // namespace paddle - -#endif diff --git a/paddle/fluid/platform/dynload/murand.cc b/paddle/fluid/platform/dynload/murand.cc deleted file mode 100644 index 82b911ead32715..00000000000000 --- a/paddle/fluid/platform/dynload/murand.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/murand.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/murand.h b/paddle/fluid/platform/dynload/murand.h deleted file mode 100644 index b20a49a7043846..00000000000000 --- a/paddle/fluid/platform/dynload/murand.h +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/murand.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -#define MURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(murandCreateGenerator); \ - __macro(murandSetStream); \ - __macro(murandSetPseudoRandomGeneratorSeed); \ - __macro(murandGenerateUniform); \ - __macro(murandGenerateUniformDouble); \ - __macro(murandGenerateNormal); \ - __macro(murandDestroyGenerator); - -MURAND_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CURAND_WRAP); - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.cc b/paddle/fluid/platform/dynload/musa_driver.cc deleted file mode 100644 index 8898bd4dfb654a..00000000000000 --- a/paddle/fluid/platform/dynload/musa_driver.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/musa_driver.h" - -#include "paddle/phi/backends/dynload/musa_driver.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSA_ROUTINE_EACH(DEFINE_WRAP); - -bool HasCUDADriver() { return phi::dynload::HasCUDADriver(); } - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musa_driver.h b/paddle/fluid/platform/dynload/musa_driver.h deleted file mode 100644 index 261841e8e73845..00000000000000 --- a/paddle/fluid/platform/dynload/musa_driver.h +++ /dev/null @@ -1,58 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/musa_driver.h" - -namespace paddle { -namespace platform { -namespace dynload { - -extern bool HasCUDADriver(); - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -/** - * include all needed musa driver functions - **/ -#define PLATFORM_MUSA_ROUTINE_EACH(__macro) \ - __macro(muInit); \ - __macro(muDriverGetVersion); \ - __macro(muGetErrorString); \ - __macro(muModuleLoadData); \ - __macro(muModuleGetFunction); \ - __macro(muModuleUnload); \ - __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ - __macro(muLaunchKernel); \ - __macro(muCtxCreate); \ - __macro(muCtxGetCurrent); \ - __macro(muDeviceGetCount); \ - __macro(muDevicePrimaryCtxGetState); \ - __macro(muDeviceGetAttribute); \ - __macro(muDeviceGet) - -PLATFORM_MUSA_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP); - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSA_WRAP - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.cc b/paddle/fluid/platform/dynload/musartc.cc deleted file mode 100644 index 4e15dab9c1359d..00000000000000 --- a/paddle/fluid/platform/dynload/musartc.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/musartc.h" - -#include "paddle/phi/backends/dynload/musartc.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSARTC_ROUTINE_EACH(DEFINE_WRAP); - -bool HasNVRTC() { return phi::dynload::HasNVRTC(); } - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musartc.h b/paddle/fluid/platform/dynload/musartc.h deleted file mode 100644 index fca957131ef4ee..00000000000000 --- a/paddle/fluid/platform/dynload/musartc.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/musartc.h" - -namespace paddle { -namespace platform { -namespace dynload { - -extern bool HasNVRTC(); - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -/** - * include all needed musartc functions - **/ -#define MUSARTC_ROUTINE_EACH(__macro) \ - __macro(mtrtcVersion); \ - __macro(mtrtcGetErrorString); \ - __macro(mtrtcCompileProgram); \ - __macro(mtrtcCreateProgram); \ - __macro(mtrtcDestroyProgram); \ - __macro(mtrtcGetMUSA); \ - __macro(mtrtcGetMUSASize); \ - __macro(mtrtcGetProgramLog); \ - __macro(mtrtcGetProgramLogSize) - -MUSARTC_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_NVRTC_WRAP - -} // namespace dynload -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/dynload/musparse.cc b/paddle/fluid/platform/dynload/musparse.cc deleted file mode 100644 index 347059362bc8db..00000000000000 --- a/paddle/fluid/platform/dynload/musparse.cc +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/platform/dynload/musparse.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -#ifdef MUSPARSE_ROUTINE_EACH -MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); -#endif - -} // namespace dynload -} // namespace platform -} // namespace paddle - diff --git a/paddle/fluid/platform/dynload/musparse.h b/paddle/fluid/platform/dynload/musparse.h deleted file mode 100644 index 586decb9c55c19..00000000000000 --- a/paddle/fluid/platform/dynload/musparse.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/musparse.h" - -namespace paddle { -namespace platform { -namespace dynload { - -#define PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ - using DynLoad__##__name = phi::dynload::DynLoad__##__name; \ - extern DynLoad__##__name __name - -#if defined(PADDLE_WITH_MUSA) - - -MUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) -#endif // PADDLE_WITH_MUSA - -#undef PLATFORM_DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP -} // namespace dynload -} // namespace platform -} // namespace paddle - diff --git a/paddle/fluid/platform/dynload/nccl.cc b/paddle/fluid/platform/dynload/nccl.cc index 2cf04248687f27..7b0ea3bb7f3c1f 100644 --- a/paddle/fluid/platform/dynload/nccl.cc +++ b/paddle/fluid/platform/dynload/nccl.cc @@ -22,21 +22,21 @@ namespace dynload { NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 NCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2304 +#if NCCL_VERSION_CODE >= 2304 NCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2703 +#if NCCL_VERSION_CODE >= 2703 NCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 NCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -// #endif +#endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index d2150204b8810a..d9516c9f4de4e8 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -29,18 +29,18 @@ namespace dynload { #define NCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -48,29 +48,29 @@ namespace dynload { NCCL_RAND_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 #define NCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(ncclBroadcast); NCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2304 -#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#if NCCL_VERSION_CODE >= 2304 +#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); NCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2703 +#if NCCL_VERSION_CODE >= 2703 #define NCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ __macro(ncclSend); \ __macro(ncclRecv); NCCL_RAND_ROUTINE_EACH_AFTER_2703(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 #define NCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ __macro(ncclRedOpCreatePreMulSum); \ __macro(ncclRedOpDestroy); NCCL_RAND_ROUTINE_EACH_AFTER_21100(PLATFORM_DECLARE_DYNAMIC_LOAD_NCCL_WRAP) 
-// #endif +#endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/rccl.cc b/paddle/fluid/platform/dynload/rccl.cc index 512a8fbafe6f61..62bb6a88af7c0a 100644 --- a/paddle/fluid/platform/dynload/rccl.cc +++ b/paddle/fluid/platform/dynload/rccl.cc @@ -22,21 +22,21 @@ namespace dynload { RCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); -// #if NCCL_VERSION_CODE >= 2212 +#if NCCL_VERSION_CODE >= 2212 RCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2304 +#if NCCL_VERSION_CODE >= 2304 RCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 2703 +#if NCCL_VERSION_CODE >= 2703 RCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) -// #endif +#endif -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 RCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) -// #endif +#endif } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/rccl.h b/paddle/fluid/platform/dynload/rccl.h index cba083334ce5c1..4d988e4fb08a08 100644 --- a/paddle/fluid/platform/dynload/rccl.h +++ b/paddle/fluid/platform/dynload/rccl.h @@ -29,17 +29,17 @@ namespace dynload { #define RCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclGetErrorString); @@ -52,7 +52,7 @@ RCCL_RAND_ROUTINE_EACH_AFTER_2212(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); RCCL_RAND_ROUTINE_EACH_AFTER_2304(PLATFORM_DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 8dab0df5007822..1a82b05f3bc3af 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -38,16 +38,6 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#include -#include -#include -#include -#include -#include -#endif // PADDLE_WITH_CUDA - #ifdef PADDLE_WITH_HIP #include #include @@ -91,20 +81,6 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA - -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/murand.h" -// #include "paddle/phi/backends/dynload/musolver.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include - -#include "paddle/phi/backends/dynload/mccl.h" -#endif // __APPLE__ -#endif // PADDLE_WITH_MUSA - - #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -122,7 +98,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/core/enforce.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" #endif #include "paddle/phi/core/flags.h" diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index d3148257ea6dea..690580d8f9c5de 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -391,7 +391,7 @@ TEST(enforce, hip_success) { EXPECT_TRUE(CheckCudaStatusFailure(HIPFFT_ALLOC_FAILED, "HIPFFT error")); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) - EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); + EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "Rccl error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "Rccl error")); #endif @@ -498,7 +498,7 @@ TEST(enforce, cuda_success) { EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error")); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) - EXPECT_TRUE(CheckCudaStatusSuccess(mcclSuccess)); + EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError, "NCCL error")); EXPECT_TRUE(CheckCudaStatusFailure(ncclInternalError, diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 68a7a2e462aa7c..e807a54fdee2d7 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -21,11 +21,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif - -#ifdef PADDLE_WITH_MUSA -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index 6bcf6a368331fa..a77e396adee5f4 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -500,8 +500,8 @@ SocketServer& SocketServer::GetInstance(const std::string& end_point) { std::vector* nccl_ids, \ int ring_id = 0); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -INSTANT_TEMPLATE(mcclUniqueId) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +INSTANT_TEMPLATE(ncclUniqueId) #endif #ifdef PADDLE_WITH_XPU_BKCL INSTANT_TEMPLATE(BKCLUniqueId) diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h index 0d975d84093cfd..d97b41311995e1 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.h +++ b/paddle/fluid/platform/gen_comm_id_helper.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) #include #include diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index c07772e1a1afc6..a3fff528f7903e 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/string/split.h" #include "paddle/phi/backends/cpu/cpu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -57,8 +57,8 @@ limitations under the License. */ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/custom_kernel.h" -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) #include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h" #endif @@ -169,7 +169,7 @@ void InitDevices() { #endif /*Init all available devices by default */ std::vector devices; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) try { // use user specified GPUs in single-node multi-process mode. devices = platform::GetSelectedDevices(); @@ -209,7 +209,7 @@ void InitDevices(const std::vector devices) { continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPlace(device)); #endif #ifdef PADDLE_WITH_XPU @@ -220,7 +220,7 @@ void InitDevices(const std::vector devices) { #endif } places.emplace_back(platform::CPUPlace()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) places.emplace_back(platform::CUDAPinnedPlace()); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -431,19 +431,19 @@ void InitMemoryMethod() { memory_method->allocation_deleter = paddle::memory::allocation::Allocator::AllocationDeleter; #if defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_CUDA) || \ - defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) + defined(PADDLE_WITH_HIP) memory_method->copy_with_stream = paddle::memory::Copy; #endif memory_method->copy = paddle::memory::Copy; memory_method->device_memory_stat_current_value = paddle::memory::DeviceMemoryStatCurrentValue; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage; #endif -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) // TODO(GhostScreaming): Use phi methods later. 
memory_method->get_allocator = [](int device_id, phi::gpuStream_t stream) -> phi::Allocator * { diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index b0bc0a111cdd23..3d215435881cfe 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -58,7 +58,7 @@ typename Visitor::result_type VisitPlace(const Place &place, const Visitor &visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace p(place.GetDeviceId()); return visitor(p); #else @@ -68,7 +68,7 @@ typename Visitor::result_type VisitPlace(const Place &place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPinnedPlace p; return visitor(p); #else diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 1ed73672f0e3e5..44c17c32fa8d56 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -685,7 +685,7 @@ void EnableProfiler(ProfilerState state) { HostTraceLevel::GetInstance().SetLevel(option.trace_level); should_send_profile_state = true; phi::GetDeviceTracer()->Enable(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (phi::ProfilerHelper::g_state == ProfilerState::kCUDA || phi::ProfilerHelper::g_state == ProfilerState::kAll || phi::ProfilerHelper::g_state == ProfilerState::kCPU) { diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index 84a20f8bf7d3c1..5d1caffd45326d 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -16,11 +16,6 @@ limitations under the License. */ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif @@ -57,20 +52,6 @@ void DummyKernelAndEvent() { PADDLE_ENFORCE_GPU_SUCCESS(hipFree(ptr)); }); } -#elif defined(PADDLE_WITH_MUSA) - for (int i = 0; i < 5; i++) { - ForEachDevice([](int d) { - platform::SetDeviceId(d); - musaStream_t stream; - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreate(&stream)); - Mark("_musa_startup_"); - int *ptr; - PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&ptr, sizeof(int))); - DummyKernel<<<1, 1, 0, stream>>>(ptr); - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); - PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); - }); - } #else for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 89c78f01ac4872..4d6bc9cc242d47 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -31,7 +31,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/mem_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif @@ -198,7 +198,7 @@ std::string OpName(const framework::VariableNameMap& name_map, const std::string& type_name); void SetTracerOption(TracerOption option); platform::TracerOption GetTracerOption(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void DummyKernelAndEvent(); #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index e67b0fbc3c68db..de8fd01a1e59de 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -552,7 +552,7 @@ void ChromeTracingLogger::LogMetaInfo(const std::string& version, span_indx); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void ChromeTracingLogger::LogDeviceProperty( const std::map& device_property_map) { // add device property information @@ -664,44 +664,6 @@ void ChromeTracingLogger::LogDeviceProperty( device_nums -= 1; } #endif -#if defined(PADDLE_WITH_MUSA) - for (auto it = device_property_map.begin(); it != device_property_map.end(); - it++) { - const gpuDeviceProp& device_property = it->second; - if (device_nums > 1) { - output_file_stream_ << string_format(std::string( - R"JSON( - { - "id": %u, "name": "%s", "totalGlobalMem": %llu, - "computeMajor": %d, "computeMinor": %d, - "smCount": %d - }, - )JSON"), - it->first, - device_property.name, - device_property.totalGlobalMem, - device_property.major, - device_property.minor, - device_property.multiProcessorCount); - } else { - output_file_stream_ << string_format(std::string( - R"JSON( - { - "id": %u, "name": "%s", "totalGlobalMem": %llu, - "computeMajor": %d, "computeMinor": %d, - "smCount": %d - }], - )JSON"), - it->first, - device_property.name, - device_property.totalGlobalMem, - device_property.major, - device_property.minor, - device_property.multiProcessorCount); - } - device_nums -= 1; - } -#endif } #endif diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.h b/paddle/fluid/platform/profiler/chrometracing_logger.h index e0cf523ea53eea..37323d1450bf2d 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.h +++ b/paddle/fluid/platform/profiler/chrometracing_logger.h @@ -41,7 +41,7 @@ class ChromeTracingLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc index c2020acf35d25a..1fce7edc3e329e 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.cc +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.cc @@ -129,7 +129,7 @@ std::unique_ptr 
DeserializationReader::Parse() { // restore NodeTrees object std::unique_ptr tree(new NodeTrees(thread_event_trees_map)); // restore gpuDeviceProp -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map device_property_map; for (auto indx = 0; indx < node_trees_proto_->device_property_size(); indx++) { @@ -155,7 +155,7 @@ DeserializationReader::~DeserializationReader() { // NOLINT input_file_stream_.close(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuDeviceProp DeserializationReader::RestoreDeviceProperty( const DevicePropertyProto& device_property_proto) { gpuDeviceProp device_property; diff --git a/paddle/fluid/platform/profiler/dump/deserialization_reader.h b/paddle/fluid/platform/profiler/dump/deserialization_reader.h index c8ac33c5bea49b..5f99f6fd82c55d 100644 --- a/paddle/fluid/platform/profiler/dump/deserialization_reader.h +++ b/paddle/fluid/platform/profiler/dump/deserialization_reader.h @@ -39,7 +39,7 @@ class DeserializationReader { MemTraceEventNode* RestoreMemTraceEventNode(const MemTraceEventNodeProto&); OperatorSupplementEventNode* RestoreOperatorSupplementEventNode( const OperatorSupplementEventNodeProto&); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuDeviceProp RestoreDeviceProperty(const DevicePropertyProto&); #endif diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.cc b/paddle/fluid/platform/profiler/dump/serialization_logger.cc index 9b5b2636db30bb..6f4ed06de9e8ec 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.cc +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.cc @@ -40,7 +40,7 @@ void SerializationLogger::OpenFile() { node_trees_proto_ = new NodeTreesProto(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void SerializationLogger::LogDeviceProperty( const std::map& device_property_map) { for (const auto& item : device_property_map) { diff --git a/paddle/fluid/platform/profiler/dump/serialization_logger.h b/paddle/fluid/platform/profiler/dump/serialization_logger.h index 67eafdf44e3cd1..80d5413106dedc 100644 --- a/paddle/fluid/platform/profiler/dump/serialization_logger.h +++ b/paddle/fluid/platform/profiler/dump/serialization_logger.h @@ -37,7 +37,7 @@ class SerializationLogger : public BaseLogger { void LogNodeTrees(const NodeTrees&) override; void LogExtraInfo(const std::unordered_map); void LogMemTraceEventNode(const MemTraceEventNode&) override; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void LogDeviceProperty( const std::map& device_property_map); #endif diff --git a/paddle/fluid/platform/profiler/event_python.cc b/paddle/fluid/platform/profiler/event_python.cc index 4ea1b756a458cd..c01b4abcfbbd3d 100644 --- a/paddle/fluid/platform/profiler/event_python.cc +++ b/paddle/fluid/platform/profiler/event_python.cc @@ -130,7 +130,7 @@ HostPythonNode* ProfilerResult::CopyTree(HostTraceEventNode* root) { return host_python_node; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ProfilerResult::ProfilerResult( 
std::unique_ptr tree, const ExtraInfo& extra_info, @@ -170,7 +170,7 @@ void ProfilerResult::Save(const std::string& file_name, if (format == std::string("json")) { ChromeTracingLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); @@ -178,7 +178,7 @@ void ProfilerResult::Save(const std::string& file_name, } else if (format == std::string("pb")) { SerializationLogger logger(file_name); logger.LogMetaInfo(version_, span_indx_); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) logger.LogDeviceProperty(device_property_map_); #endif tree_->LogMe(&logger); diff --git a/paddle/fluid/platform/profiler/event_python.h b/paddle/fluid/platform/profiler/event_python.h index f1d217674bf6c6..dae32a1902834e 100644 --- a/paddle/fluid/platform/profiler/event_python.h +++ b/paddle/fluid/platform/profiler/event_python.h @@ -138,7 +138,7 @@ struct HostPythonNode { class ProfilerResult { public: ProfilerResult() : tree_(nullptr) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) explicit ProfilerResult( std::unique_ptr tree, const ExtraInfo& extra_info, @@ -166,7 +166,7 @@ class ProfilerResult { std::string GetVersion() { return version_; } uint32_t GetSpanIndx() { return span_indx_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map GetDeviceProperty() { return device_property_map_; } @@ -176,7 +176,7 @@ class ProfilerResult { std::map thread_event_trees_map_; std::shared_ptr tree_; ExtraInfo extra_info_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map device_property_map_; #endif std::string version_; diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 2bb7731b0c1599..bcb35f5b7bd352 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -18,16 +18,10 @@ #ifdef PADDLE_WITH_CUDA #include #endif - -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/enforce.h" @@ -53,9 +47,6 @@ void SynchronizeDevice() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize()); #endif -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE auto dev_types = phi::DeviceManager::GetAllCustomDeviceTypes(); for (const auto& dev_type : dev_types) { @@ -171,7 +162,7 @@ std::unique_ptr Profiler::Stop() { std::string("%s"), kv.second.c_str()); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) std::map device_property_map; std::vector device_ids = GetSelectedDevices(); for (auto device_id : device_ids) { diff --git 
a/paddle/fluid/platform/profiler/profiler_test.cc b/paddle/fluid/platform/profiler/profiler_test.cc index 86243e9258dd62..f7f888d9e67396 100644 --- a/paddle/fluid/platform/profiler/profiler_test.cc +++ b/paddle/fluid/platform/profiler/profiler_test.cc @@ -23,9 +23,6 @@ #ifdef PADDLE_WITH_HIP #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_python.h" @@ -83,11 +80,6 @@ TEST(ProfilerTest, TestCudaTracer) { hipStream_t stream; hipStreamCreate(&stream); hipStreamSynchronize(stream); -#endif -#ifdef PADDLE_WITH_MUSA - musaStream_t stream; - musaStreamCreate(&stream); - musaStreamSynchronize(stream); #endif auto profiler_result = profiler->Stop(); auto nodetree = profiler_result->GetNodeTrees(); diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index e1720874e1489c..9835e7525c51ef 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -34,10 +34,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif #ifdef PADDLE_WITH_CUSTOM_DEVICE #include "paddle/phi/backends/device_manager.h" #endif @@ -107,17 +103,6 @@ void SynchronizeAllDevice() { } SetDeviceId(pre_device_id); #endif - -#ifdef PADDLE_WITH_MUSA - int pre_device_id = GetCurrentDeviceId(); - int count = GetGPUDeviceCount(); - for (int i = 0; i < count; i++) { - SetDeviceId(i); - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); - } - SetDeviceId(pre_device_id); -#endif - #ifdef PADDLE_WITH_HIP int pre_device_id = GetCurrentDeviceId(); int count = GetGPUDeviceCount(); @@ -156,7 +141,7 @@ void PrintMemProfiler( << " Memory Profiling Report " << "<-------------------------\n\n"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int num_gpus = GetGPUDeviceCount(); std::cout.setf(std::ios::left); if (num_gpus > 0) { @@ -358,7 +343,7 @@ void SetEvent(bool merge_thread, if (rit != pushed_events->rend()) { double event_time = 0; double gpu_time = 0.0f; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu_time = rit->CudaElapsedMs(analyze_event); #endif double cpu_time = rit->CpuElapsedMs(analyze_event); diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 97ca34c0209d39..c55bcb71a7d432 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -24,11 +24,6 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) #endif -#ifdef PADDLE_WITH_MUSA -static void StreamCallbackFunc(gpuStream_t stream, - gpuError_t status, - void *user_data) -#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void *user_data) @@ -63,11 +58,6 @@ void StreamCallbackManager::AddCallback( PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); -#endif - #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -81,7 +71,7 @@ void StreamCallbackManager::AddCallback( template void 
StreamCallbackManager::Wait() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) platform::GpuStreamSync(stream_); #endif { @@ -98,8 +88,5 @@ template struct StreamCallbackManager; #ifdef PADDLE_WITH_HIP template struct StreamCallbackManager; #endif -#ifdef PADDLE_WITH_MUSA -template struct StreamCallbackManager; -#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 1cc0f0e5cf1e9a..7cd6930a9d0d0f 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -25,11 +25,6 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #include #include // NOLINT #include diff --git a/paddle/fluid/primitive/composite/composite.h b/paddle/fluid/primitive/composite/composite.h index a35095c98d4a29..66f17168ec01a5 100644 --- a/paddle/fluid/primitive/composite/composite.h +++ b/paddle/fluid/primitive/composite/composite.h @@ -22,6 +22,9 @@ namespace paddle { namespace primitive { namespace details { +// empty_shape means x.shape=[] +static std::vector empty_shape; + template Tensor mean_decomp(const Tensor& x, const IntArray& axis, bool keepdim) { auto org_dtype = x.dtype(); @@ -345,62 +348,66 @@ std::tuple layer_norm_decomp( // cast dtype to float32 if dtype =float16 or bfloat16 if (need_cast) { - x_cast = cast(x_cast, phi::DataType::FLOAT32); + x_cast = cast(x_cast, DataType::FLOAT32); } auto x_dim = common::vectorize(x.dims()); for (size_t i = begin_norm_axis; i < x_dim.size(); i++) { axis.push_back(static_cast(i)); } - auto mean_ = mean_decomp(x_cast, IntArray(axis), true); + auto mean_ = mean_decomp(x_cast, axis, true); auto difference = x_cast - mean_; auto var_tmp1 = difference * difference; - auto variance = mean_decomp(var_tmp1, IntArray(axis), true); + auto variance = mean_decomp(var_tmp1, axis, true); auto var_tmp3 = variance + epsilon; auto rsqrt_var = elementwise_pow( - var_tmp3, - full(common::vectorize(var_tmp3.dims()), -0.5, var_tmp3.dtype())); + var_tmp3, full(empty_shape, -0.5, var_tmp3.dtype())); auto out = difference * rsqrt_var; auto scale_ptr = scale.get_ptr(); auto bias_ptr = bias.get_ptr(); - std::vector slice_shape; - for (int64_t i = begin_norm_axis; i < static_cast(x_dim.size()); - i++) { - slice_shape.push_back(x_dim[i]); + std::vector slice_shape_l; + std::vector slice_shape_r; + for (int64_t i = 0; i < static_cast(x_dim.size()); i++) { + if (i < begin_norm_axis) { + slice_shape_l.push_back(x_dim[i]); + } else { + slice_shape_r.push_back(x_dim[i]); + } } Tensor scale_cast; if (scale_ptr) { - if (slice_shape != scale_ptr->shape()) { - scale_cast = reshape(*scale_ptr, slice_shape); + if (slice_shape_r != scale_ptr->shape()) { + scale_cast = reshape(*scale_ptr, slice_shape_r); } else { scale_cast = *scale_ptr; } if (need_cast) { - scale_cast = cast(scale_cast, phi::DataType::FLOAT32); + scale_cast = cast(scale_cast, DataType::FLOAT32); } out = out * scale_cast; } Tensor bias_cast; if (bias_ptr) { - if (slice_shape != bias_ptr->shape()) { - bias_cast = reshape(*bias_ptr, slice_shape); + if (slice_shape_r != bias_ptr->shape()) { + bias_cast = reshape(*bias_ptr, slice_shape_r); } else { bias_cast = *bias_ptr; } if (need_cast) { - bias_cast = cast(bias_cast, phi::DataType::FLOAT32); + bias_cast = cast(bias_cast, DataType::FLOAT32); } out = out + bias_cast; } - mean_ = reshape(mean_, 
std::vector({-1})); - variance = reshape(variance, std::vector({-1})); + mean_ = reshape(mean_, slice_shape_l); + variance = reshape(variance, slice_shape_l); + // same as LayerNormInferMeta + // x: float32 --> out: float32, mean: float32, variance: float32 + // x: float16 --> out: float16, mean: float32, variance: float32 if (need_cast) { out = cast(out, org_dtype); - mean_ = cast(mean_, org_dtype); - variance = cast(variance, org_dtype); } return std::make_tuple(out, mean_, variance); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5306d282e797ca..4f761aa3c8536d 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -67,7 +67,7 @@ if(WITH_RPC) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_rpc ${EXTERNAL_BRPC_DEPS} zlib phi common) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) endif() @@ -79,7 +79,7 @@ if(WITH_IPU) set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) endif() @@ -99,7 +99,6 @@ if(WITH_CUSTOM_DEVICE) if(NOT (WITH_NCCL OR WITH_RCCL - OR WITH_MCCL OR WITH_XPU_BKCL)) set(PYBIND_DEPS ${PYBIND_DEPS} reducer) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) @@ -108,7 +107,7 @@ endif() if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_context) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() @@ -163,7 +162,7 @@ endif() if(WITH_PYTHON) set(PYBIND_DEPS ${PYBIND_DEPS} process_group eager_reducer) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} process_group_nccl) endif() if(WITH_XPU_BKCL) @@ -247,7 +246,7 @@ if(WITH_RPC) set(PYBIND_SRCS rpc.cc ${PYBIND_SRCS}) endif() -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) list(APPEND PYBIND_SRCS nccl_wrapper_py.cc) endif() @@ -266,7 +265,7 @@ if(WITH_PYTHON) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) - if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) + if(WITH_NCCL OR WITH_RCCL) list(APPEND OP_FUNCTION_GENERETOR_DEPS nccl_context) endif() @@ -287,20 +286,15 @@ if(WITH_PYTHON) eager_legacy_op_function_generator.cc) set(GENERATOR_DEPS ${PYBIND_DEPS}) list(REMOVE_DUPLICATES GENERATOR_DEPS) - if(NOT WITH_ARM) + if(WIN32) list(REMOVE_ITEM GENERATOR_DEPS python) endif() target_link_libraries(eager_legacy_op_function_generator ${GENERATOR_DEPS}) - # if(NOT WIN32) - # add_executable(kernel_signature_generator kernel_signature_generator.cc) - # if(WITH_MUSA) - # # libtinfo.so depended by libmusa.so is located in '/usr/lib/x86_64-linux-gnu/' - # target_link_options(kernel_signature_generator PRIVATE - # -Wl,-rpath,/usr/lib/x86_64-linux-gnu/) - # endif() - # target_link_libraries(kernel_signature_generator - # ${OP_FUNCTION_GENERETOR_DEPS}) - # endif() + if(NOT WIN32) + add_executable(kernel_signature_generator kernel_signature_generator.cc) + target_link_libraries(kernel_signature_generator + ${OP_FUNCTION_GENERETOR_DEPS}) + endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(eager_legacy_op_function_generator diff --git a/paddle/fluid/pybind/communication.cc b/paddle/fluid/pybind/communication.cc 
index 6351d021dfe8cb..391dbabb1a2109 100644
--- a/paddle/fluid/pybind/communication.cc
+++ b/paddle/fluid/pybind/communication.cc
@@ -48,7 +48,7 @@ void BindCommContextManager(py::module *m) {
       .def_static("set_device_id",
                   &phi::distributed::CommContextManager::SetDeviceId,
                   py::call_guard())
-#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
       .def_static(
           "create_nccl_comm_context",
           &phi::distributed::CommContextManager::CreateNCCLCommContext,
diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc
index a07aef2fb69965..2a6c639735a2b4 100644
--- a/paddle/fluid/pybind/cuda_streams_py.cc
+++ b/paddle/fluid/pybind/cuda_streams_py.cc
@@ -24,7 +24,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 phi::CUDAStream *get_current_stream(int device_id) {
   if (device_id == -1) {
     device_id = phi::backends::gpu::GetCurrentDeviceId();
@@ -51,7 +51,7 @@ void BindCudaStream(py::module *m_ptr) {
   m.def(
       "_get_current_stream",
       [](int deviceId) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         return platform::get_current_stream(deviceId);
 #else
         PADDLE_THROW(
@@ -64,7 +64,7 @@ void BindCudaStream(py::module *m_ptr) {
   m.def(
       "_set_current_stream",
       [](phi::CUDAStream *stream) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         return platform::set_current_stream(stream);
 #else
         PADDLE_THROW(
@@ -75,7 +75,7 @@ void BindCudaStream(py::module *m_ptr) {
       py::return_value_policy::reference);
   m.def("_device_synchronize", [](int device_id) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (device_id == -1) {
       device_id = paddle::platform::GetCurrentDeviceId();
     }
@@ -84,8 +84,6 @@ void BindCudaStream(py::module *m_ptr) {
     paddle::platform::SetDeviceId(device_id);
 #ifdef PADDLE_WITH_HIP
     PADDLE_ENFORCE_GPU_SUCCESS(hipDeviceSynchronize());
-#elif defined(PADDLE_WITH_MUSA)
-    PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize());
 #else
     PADDLE_ENFORCE_GPU_SUCCESS(cudaDeviceSynchronize());
 #endif
@@ -116,7 +114,7 @@ void BindCudaStream(py::module *m_ptr) {
           >>> s3 = paddle.device.cuda.Stream()
       )DOC")
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def(
           "wait_event",
          [](phi::CUDAStream &self, paddle::platform::CudaEvent &event) {
@@ -251,7 +249,7 @@ void BindCudaStream(py::module *m_ptr) {
      .def(
          "__init__",
          [](phi::CUDAStream &self, platform::CUDAPlace *place, int priority) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
            if (priority != 1 && priority != 2) {
              PADDLE_THROW(platform::errors::InvalidArgument(
                  "Priority should be 1(high) or 2(normal) "));
@@ -307,7 +305,7 @@ void BindCudaStream(py::module *m_ptr) {
          py::arg("device") = -1,
          py::arg("priority") = 2)
       .def("__init__", [](phi::CUDAStream &self) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
         int device_id = platform::GetCurrentDeviceId();
         auto stream_flag = phi::CUDAStream::StreamFlag::kStreamNonBlocking;
         new (&self) phi::CUDAStream(
@@ -334,7 +332,7 @@ void BindCudaStream(py::module *m_ptr) {
           >>> event = paddle.device.cuda.Event()
       )DOC")
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       .def(
           "record",
           [](paddle::platform::CudaEvent &self, phi::CUDAStream *stream) {
@@ -401,7 +399,7 @@ void BindCudaStream(py::module *m_ptr) {
              bool enable_timing,
              bool blocking,
              bool interprocess) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
            unsigned int flags = platform::GenerateDeviceEventFlag(
                enable_timing, blocking, interprocess);
            new (&self) paddle::platform::CudaEvent(flags);
diff --git a/paddle/fluid/pybind/cuda_streams_py.h b/paddle/fluid/pybind/cuda_streams_py.h
index 61f27960e25e9d..d10608a6e8ea96 100644
--- a/paddle/fluid/pybind/cuda_streams_py.h
+++ b/paddle/fluid/pybind/cuda_streams_py.h
@@ -17,7 +17,7 @@
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/core/cuda_stream.h"
 #else
 namespace phi {
@@ -29,7 +29,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace platform {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 phi::CUDAStream* get_current_stream(int device_id = -1);
 phi::CUDAStream* set_current_stream(phi::CUDAStream* stream);
 #endif
diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc
index ea61387ae53e51..4577171fd77bb5 100644
--- a/paddle/fluid/pybind/distributed_py.cc
+++ b/paddle/fluid/pybind/distributed_py.cc
@@ -32,7 +32,7 @@ limitations under the License. */
 #include "paddle/phi/api/all.h"
 #include "paddle/phi/core/distributed/types.h"
-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
 #include "paddle/fluid/distributed/collective/process_group_nccl.h"
 #endif
@@ -1224,7 +1224,7 @@ void BindDistributed(py::module *m) {
           py::arg("id"),
           py::call_guard());
-#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || defined(PADDLE_WITH_NCCL)
+#if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)
   py::class_>(
       *m, "ProcessGroupNCCL", ProcessGroup)
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 098c2fa4bdf778..894ede8db18d2b 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -322,7 +322,7 @@ void InitTensorWithNumpyValue(TensorObject* self,
 #endif
     SetTensorFromPyArray(impl_ptr, array, place, zero_copy);
   } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     phi::backends::gpu::SetDeviceId(place.device);
     VLOG(4) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
             << " from " << static_cast(place.device);
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 956de0e9d371a0..df84ca68b9182b 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -58,7 +58,7 @@ typedef SSIZE_T ssize_t;
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/pybind/cuda_streams_py.h"
 #endif
diff --git a/paddle/fluid/pybind/eager_math_op_patch.cc b/paddle/fluid/pybind/eager_math_op_patch.cc
index e932ecb34201c7..2c01e122914aa4 100644
--- a/paddle/fluid/pybind/eager_math_op_patch.cc
+++ b/paddle/fluid/pybind/eager_math_op_patch.cc
@@ -139,7 +139,7 @@ std::set _complex_dtypes{
 void SetDevice(paddle::platform::Place place) {
   if (paddle::platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     phi::backends::gpu::SetDeviceId(place.device);
     VLOG(6) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
             << " from " << static_cast(place.device);
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 48a8fdc8daa700..584d1b8b58482a 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -54,6 +54,7 @@ typedef SSIZE_T ssize_t;
 #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
 #include "paddle/common/ddim.h"
 #include "paddle/fluid/eager/amp_utils.h"
+#include "paddle/fluid/eager/api/generated/eager_generated/backwards/nodes.h"
 #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h"
 #include "paddle/fluid/eager/eager_amp_auto_cast.h"
 #include "paddle/fluid/framework/python_headers.h"
@@ -318,13 +319,11 @@ static PyObject* tensor_method_numpy(TensorObject* self,
                         dense_tensor->Holder()->size());
     }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   } else if (self->tensor.is_gpu()) {
     eager_gil_scoped_release guard;
 #if defined(PADDLE_WITH_CUDA)
     gpuMemcpyKind kind =
cudaMemcpyDeviceToHost; -#elif defined(PADDLE_WITH_MUSA) - gpuMemcpyKind kind = musaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; phi::DeviceContextPool::Instance().Get(self->tensor.place())->Wait(); @@ -1361,6 +1360,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Dealing with basic indexing + bool out_is_view = false; auto out = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1369,7 +1369,8 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); if (!has_advanced_index) { return ToPyObject(out); @@ -1377,7 +1378,7 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, // step3: Dealing with advanced indexing std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_tensor = dealWithAdvancedIndex(out, @@ -1387,7 +1388,9 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); if (transed_index.size() == 1 && transed_index[0].dtype() == phi::DataType::BOOL) { @@ -1417,14 +1420,14 @@ static PyObject* tensor__getitem_dygraph(TensorObject* self, if (pos_of_new_dim != 0) { std::vector perm(out.shape().size(), 0); - int tmp1 = pos_of_new_dim, tmp2 = 0, + int tmp1 = rank_of_new_dim, tmp2 = 0, tmp3 = pos_of_new_dim + rank_of_new_dim; for (int i = 0; i < static_cast(out.shape().size()); ++i) { - if (i < rank_of_new_dim) { + if (i < pos_of_new_dim) { perm[i] = - tmp1++; // range(pos_of_new_dim, pos_of_new_dim + rank_of_new_dim) - } else if (i >= rank_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { - perm[i] = tmp2++; // range(0, pos_of_new_dim) + tmp1++; // range(rank_of_new_dim, pos_of_new_dim + rank_of_new_dim) + } else if (i >= pos_of_new_dim && i < pos_of_new_dim + rank_of_new_dim) { + perm[i] = tmp2++; // range(0, rank_of_new_dim) } else { perm[i] = tmp3++; // range(pos_of_new_dim + rank_of_new_dim, out.ndim) } @@ -1609,12 +1612,9 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &use_strided_slice); // step2: Parse values - PADDLE_ENFORCE( - PyCheckTensor(value_obj), - platform::errors::InvalidArgument("The value must be a Tensor")); - + std::vector values; paddle::Tensor value_tensor = - reinterpret_cast(value_obj)->tensor; + dealWithValues(tensor, value_obj, &values, has_advanced_index); if (!has_advanced_index) { // use set_value OP if there is no advanced index @@ -1622,45 +1622,60 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // Release gil and do tracing py::gil_scoped_release release; // use inplace set_value_ operator - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "set_value"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "set_value"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = 
cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "set_value"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "set_value"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // step3.1: Only basic indexing, use OP set_value. - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { - ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); - } - self->tensor = set_value_with_tensor__ad_func(self->tensor, - value_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. - if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + // step3.1: Only basic indexing, use OP set_value. + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor, value_tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor, value_tensor); } + self->tensor = set_value_with_tensor__ad_func(self->tensor, + value_tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes); + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. + if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } + } + } else { + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor(&mesh, self->tensor)) { + ConvertAllInputsToDistTensor(mesh, self->tensor); + } + self->tensor = set_value__ad_func(self->tensor, + slice_starts, + slice_ends, + slice_strides, + slice_axes, + decrease_axis, + none_axes, + {1}, + values); } } else { // step3.2: Case for there are advanced indexing. @@ -1670,6 +1685,7 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, // 3. assign values to the sliced result by index_put OP; // 4. transpose back and assign the result to original tensor by set_value // OP. 
+ bool out_is_view = false; paddle::Tensor sub_tensor = getTensorWithBasicIndexing(tensor, &slice_axes, &slice_starts, @@ -1678,12 +1694,13 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &decrease_axis, &none_axes, &infer_flags, - &use_strided_slice); + &use_strided_slice, + &out_is_view); std::vector transed_index; - std::vector trans_back_dim; + std::vector trans_back_dim, trans_dim; - int pos_of_new_dim = 0, rank_of_new_dim = 0; + int pos_of_new_dim = INT_MAX, rank_of_new_dim = 1; paddle::Tensor transed_sub_tensor = dealWithAdvancedIndex(sub_tensor, @@ -1693,61 +1710,127 @@ static PyObject* tensor__setitem_dygraph(TensorObject* self, &transed_index, &trans_back_dim, &pos_of_new_dim, - &rank_of_new_dim); + &rank_of_new_dim, + &trans_dim, + &out_is_view); // Release gil and do tracing py::gil_scoped_release release; - - if (value_tensor.initialized() && - (self->tensor.dtype() != value_tensor.dtype())) { - if (egr::Controller::Instance().GetAMPLevel() != - paddle::imperative::AmpLevel::O0) { - paddle::small_vector, - egr::kSlotSmallVectorSize> - tmps = {{self->tensor}, {value_tensor}}; - auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); - self->tensor = egr::EagerAmpAutoCast( - self->tensor.name(), self->tensor, amp_dtype, "index_put"); - value_tensor = egr::EagerAmpAutoCast( - value_tensor.name(), value_tensor, amp_dtype, "index_put"); - } + if (value_tensor.initialized()) { if (self->tensor.dtype() != value_tensor.dtype()) { - value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + if (egr::Controller::Instance().GetAMPLevel() != + paddle::imperative::AmpLevel::O0) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("index_put", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "index_put"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, amp_dtype, "index_put"); + } + if (self->tensor.dtype() != value_tensor.dtype()) { + value_tensor = cast_ad_func(value_tensor, self->tensor.dtype()); + } } - } - // TODO(zoooo0820) 1.Using inplace version index_put - // 2.Remove following code after backward bug fixed. - transed_sub_tensor = assign_ad_func(transed_sub_tensor); + if (value_tensor.dims().size() > 1 && pos_of_new_dim != 0) { + value_tensor = transpose_ad_func(value_tensor, trans_dim); + } - const phi::distributed::ProcessMesh* mesh = nullptr; - if (InputsContainDistTensor( - &mesh, self->tensor, transed_sub_tensor, value_tensor)) { - ConvertAllInputsToDistTensor( - mesh, self->tensor, transed_sub_tensor, value_tensor); - } + const phi::distributed::ProcessMesh* mesh = nullptr; + if (InputsContainDistTensor( + &mesh, self->tensor, transed_sub_tensor, value_tensor)) { + ConvertAllInputsToDistTensor( + mesh, self->tensor, transed_sub_tensor, value_tensor); + } - transed_sub_tensor = - index_put_ad_func(transed_sub_tensor, transed_index, value_tensor); - - paddle::Tensor transback_sub_tensor = - transpose_ad_func(transed_sub_tensor, trans_back_dim); - - self->tensor = set_value_with_tensor__ad_func(self->tensor, - transback_sub_tensor, - slice_starts, - slice_ends, - slice_strides, - slice_axes, - decrease_axis, - none_axes); - if (PyCheckTensor(value_obj)) { - // pass the stop_gradient from value to tensor. - // pass stop gradient should be done after CheckInplace in - // set_value__dygraph_function. 
- if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && - egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { - egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + if (transed_index.size() == 1 && + transed_index[0].dtype() == phi::DataType::BOOL && + transed_index[0].shape().size() == self->tensor.shape().size()) { + if (value_tensor.shape() != self->tensor.shape()) { + value_tensor = expand_ad_func(value_tensor, self->tensor.shape()); + } + transed_sub_tensor = + where__ad_func(logical_not_ad_func(transed_index[0]), + transed_sub_tensor, + value_tensor); + } else { + transed_sub_tensor = + index_put__ad_func(transed_sub_tensor, transed_index, value_tensor); + } + + if (out_is_view) { + // NOTE(zoooo0820): if out_is_view is true, it is a case of + // combined-indexing setitem, i.e. firstly we get a view of + // self->tensor, then modified it with inplace api index_put_ For now, + // in design of Paddle, the forward result is right. But the backward + // edge can not be established because the Base Tensor cannot sense + // whether it has been modified by other operations. Following codes are + // to add a new node (set_value_with_tensor_grad) to record the backward + // edge, with out ad_function which needs to do the forward calculation. + + egr::AutogradMeta* x_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(self->tensor); + egr::AutogradMeta* values_autograd_meta = + egr::EagerUtils::nullable_autograd_meta(transed_sub_tensor); + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, x_autograd_meta, values_autograd_meta); + // Node Declaration + std::shared_ptr grad_node; + // Set grad_node before API Call + if (require_any_grad) { + paddle::Tensor transback_sub_tensor = + transpose_ad_func(transed_sub_tensor, trans_back_dim); + const auto& values_tmp = + (require_any_grad && transback_sub_tensor.is_dense_tensor() && + !std::dynamic_pointer_cast( + transback_sub_tensor.impl()) + ->meta() + .is_contiguous()) + ? paddle::Tensor( + std::make_shared( + std::move(paddle::experimental::Trans2Contiguous( + *(std::dynamic_pointer_cast( + transback_sub_tensor.impl()))))), + transback_sub_tensor.mutable_autograd_meta()) + : transback_sub_tensor; + + grad_node = std::shared_ptr( + new SetValueWithTensorGradNode(1, 2)); // NOLINT + grad_node->SetAttributestarts(slice_starts); + grad_node->SetAttributeends(slice_ends); + grad_node->SetAttributesteps(slice_strides); + grad_node->SetAttributeaxes(slice_axes); + grad_node->SetAttributedecrease_axes(decrease_axis); + grad_node->SetAttributenone_axes(none_axes); + grad_node->SetTensorWrappervalues(values_tmp); + + paddle::memory::LogDeviceMemoryStats( + egr::Controller::Instance().GetExpectedPlace(), + "set_value_with_tensor"); + egr::EagerUtils::CheckInplace( + self->tensor, x_autograd_meta, require_any_grad); + egr::EagerUtils::PassStopGradient(false, x_autograd_meta); + // SetGradOutMeta & SetEdges + grad_node->SetGradOutMeta(self->tensor, 0); + grad_node->SetGradOutMeta(transback_sub_tensor, 1); + if (x_autograd_meta) { + egr::EagerUtils::SetOutRankWithSlot(x_autograd_meta, 0); + egr::EagerUtils::SetHistory(x_autograd_meta, grad_node); + } + grad_node->SetGradInMeta(self->tensor, 0); + } + } + if (PyCheckTensor(value_obj)) { + // pass the stop_gradient from value to tensor. + // pass stop gradient should be done after CheckInplace in + // set_value__dygraph_function. 
+ if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() && + egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) { + egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false); + } } } } diff --git a/paddle/fluid/pybind/generator_py.cc b/paddle/fluid/pybind/generator_py.cc index 05374b08d8fc25..520fe09bc710cd 100644 --- a/paddle/fluid/pybind/generator_py.cc +++ b/paddle/fluid/pybind/generator_py.cc @@ -40,7 +40,7 @@ void BindGenerator(py::module* m_ptr) { [](std::shared_ptr& self) { return self->current_seed; }) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) // NOTE(shenliang03): Due to the inability to serialize mt19937_64 // type, resulting in a problem with precision under the cpu. diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 7199eb13c579bc..8ba56008fb2b0b 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -869,7 +869,7 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) py::class_()); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_>( @@ -951,7 +951,7 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CUSTOM_DEVICE) py::class_) @@ -373,10 +373,10 @@ void BindPlace(pybind11::module &m) { // NOLINT #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 53 support float16 -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP return true; #else return platform::GetGPUComputeCapability(place.device) >= 53; @@ -384,7 +384,7 @@ void BindPlace(pybind11::module &m) { // NOLINT }); m.def("is_bfloat16_supported", [](const platform::CUDAPlace &place) -> bool { // Only GPUs with Compute Capability >= 80 support bfloat16 -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP return false; #else return platform::GetGPUComputeCapability(place.device) >= 80; @@ -546,7 +546,7 @@ void BindPlace(pybind11::module &m) { // NOLINT cudapinnedplace .def("__init__", [](platform::CUDAPinnedPlace &self) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); diff --git a/paddle/fluid/pybind/process_group_utils.h b/paddle/fluid/pybind/process_group_utils.h index 7b9002feed8ed7..3ba9ec3239c371 100644 --- 
a/paddle/fluid/pybind/process_group_utils.h +++ b/paddle/fluid/pybind/process_group_utils.h @@ -268,7 +268,7 @@ void ConcatTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ConcatDenseTensorWithType(static_cast(dev_ctx), tensor_list, dense_tensor, @@ -325,7 +325,7 @@ void SplitTensor(const phi::DeviceContext &dev_ctx, const auto &place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) SplitDenseTensorWithType(static_cast(dev_ctx), tensor, &dense_list, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7949d7c1c33946..feafd1fa4333e6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -134,7 +134,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -146,11 +146,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/tensor.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -238,7 +238,7 @@ bool IsCompiledWithAVX() { } bool IsCompiledWithCUDA() { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) return false; #else return true; @@ -279,15 +279,7 @@ bool IsCompiledWithMPIAWARE() { } bool IsCompiledWithROCM() { -#if !defined(PADDLE_WITH_HIP) - return false; -#else - return true; -#endif -} - -bool IsCompiledWithMUSA() { -#if !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP return false; #else return true; @@ -683,16 +675,16 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() { string::join_strings(ops, ','))); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static int GetNCCLVersion() { -// #if NCCL_VERSION_CODE >= 2304 +#if NCCL_VERSION_CODE >= 2304 int ver; - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::mcclGetVersion(&ver)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetVersion(&ver)); return ver; -// #else -// PADDLE_THROW(platform::errors::External( -// "Cannot get NCCL version successfully when nccl version < 2.3.4")); -// #endif +#else + PADDLE_THROW(platform::errors::External( + "Cannot get NCCL version successfully when nccl version < 2.3.4")); +#endif } #endif @@ -938,7 +930,7 @@ PYBIND11_MODULE(libpaddle, m) { return 
self->OutputMeta(); }); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("cudnn_version", &platform::DnnVersion); m.def("gpu_memory_available", []() { size_t available = 0; @@ -948,7 +940,7 @@ PYBIND11_MODULE(libpaddle, m) { }); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) m.def("nccl_version", &GetNCCLVersion); #endif @@ -990,7 +982,7 @@ PYBIND11_MODULE(libpaddle, m) { if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dmt, &tensor); } @@ -1264,7 +1256,7 @@ All parameter, weight, gradient are variables in Paddle. "get_fetch_list", [](Variable &self) { return self.GetMutable(); }, py::return_value_policy::reference) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) .def( "get_communicator", [](Variable &self) -> platform::Communicator * { @@ -1732,7 +1724,7 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1766,7 +1758,7 @@ All parameter, weight, gradient are variables in Paddle. "create", [](paddle::platform::CUDAPinnedPlace &place) -> paddle::platform::DeviceContext * { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPinnedPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); @@ -1774,7 +1766,7 @@ All parameter, weight, gradient are variables in Paddle. return new paddle::platform::CUDAPinnedDeviceContext(place); #endif }); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) py::class_(m, "Communicator").def(py::init<>()); #endif m.def("get_all_device_type", []() { @@ -2114,7 +2106,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_avx", IsCompiledWithAVX); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_rocm", IsCompiledWithROCM); - m.def("is_compiled_with_musa", IsCompiledWithMUSA); m.def("is_compiled_with_custom_device", IsCompiledWithCustomDevice); m.def("is_compiled_with_ipu", IsCompiledWithIPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); @@ -2393,7 +2384,7 @@ All parameter, weight, gradient are variables in Paddle. 
py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetGPUDeviceCount); m.def("get_cuda_current_device_id", &platform::GetCurrentDeviceId); m.def("cuda_empty_cache", [] { @@ -2439,7 +2430,7 @@ All parameter, weight, gradient are variables in Paddle. return ostr.str(); }); -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) && !defined(_WIN32) +#if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); m.def("nvprof_start", platform::CudaProfilerStart); m.def("nvprof_stop", platform::CudaProfilerStop); @@ -2521,7 +2512,7 @@ All parameter, weight, gradient are variables in Paddle. .def("save", &paddle::platform::ProfilerResult::Save) .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo) .def("get_version", &paddle::platform::ProfilerResult::GetVersion) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) .def("get_span_indx", &paddle::platform::ProfilerResult::GetSpanIndx) .def("get_device_property", &paddle::platform::ProfilerResult::GetDeviceProperty); @@ -2678,7 +2669,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("enable_op_info_recorder", &phi::EnableOpInfoRecorder); m.def("disable_op_info_recorder", &phi::DisableOpInfoRecorder); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("set_cublas_switch", phi::SetAllowTF32Cublas); m.def("get_cublas_switch", phi::AllowTF32Cublas); m.def("set_cudnn_switch", phi::SetAllowTF32Cudnn); @@ -2966,7 +2957,7 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_BOX_PS BindBoxWrapper(&m); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) BindNCCLWrapper(&m); #endif #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/pybind/slice_utils.h b/paddle/fluid/pybind/slice_utils.h index 918d2eeae4272a..919a3a4650d3e7 100644 --- a/paddle/fluid/pybind/slice_utils.h +++ b/paddle/fluid/pybind/slice_utils.h @@ -26,9 +26,11 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/scope_guard.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" +#include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -345,11 +347,13 @@ static paddle::Tensor getTensorWithBasicIndexing( std::vector* decrease_axis, std::vector* none_axes, std::vector* infer_flags, - bool* use_strided_slice) { + bool* use_strided_slice, + bool* out_is_view) { paddle::Tensor out; if (slice_axes->empty()) { out = tensor; } else { + *out_is_view = true; if (!(*use_strided_slice)) { eager_gil_scoped_release guard; out = slice_ad_func(tensor, @@ -370,6 +374,7 @@ static paddle::Tensor getTensorWithBasicIndexing( } } if (!none_axes->empty()) { + *out_is_view = true; eager_gil_scoped_release guard; // Deal with cases that decrease_axes is not empty // For example: @@ -397,9 +402,9 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector* transed_index, std::vector* trans_back_dim, int* pos_of_new_dim, - int* rank_of_new_dim) { - std::vector trans_dim; - + int* rank_of_new_dim, + std::vector* trans_dim, + bool* out_is_view) { int p = 0; for (size_t i = 0; i < advanced_index_dim->size(); ++i) { auto index_dim = (*advanced_index_dim)[i]; @@ -408,30 +413,28 @@ static paddle::Tensor dealWithAdvancedIndex( // advanced_index_dim auto index = (*advanced_index)[p++]; - if (!is_for_setitem) { - if (index_dim == 0) { - // case 1: advanced indices at axis 0, the new dim will be at first. - *pos_of_new_dim = 0; - } else if (index_dim > 0 && trans_dim.size() > 0 && - trans_dim[trans_dim.size() - 1] != index_dim - 1) { - // case 2: there are not adjacent advanced indices, the new dim will - // be at first. - *pos_of_new_dim = 0; - } else { - *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); - } - *rank_of_new_dim = - std::max(*rank_of_new_dim, static_cast(index.shape().size())); + if (index_dim == 0) { + // case 1: advanced indices at axis 0, the new dim will be at first. + *pos_of_new_dim = 0; + } else if (index_dim > 0 && trans_dim->size() > 0 && + (*trans_dim)[trans_dim->size() - 1] != index_dim - 1) { + // case 2: there are not adjacent advanced indices, the new dim will + // be at first. 
+ *pos_of_new_dim = 0; + } else { + *pos_of_new_dim = std::min(index_dim, *pos_of_new_dim); } + *rank_of_new_dim = + std::max(*rank_of_new_dim, static_cast(index.shape().size())); - trans_dim.push_back(index_dim); + trans_dim->push_back(index_dim); transed_index->push_back(std::move(index)); } } for (size_t i = 0; i < tensor.shape().size(); ++i) { if ((*advanced_index_dim)[i] == -1) { - trans_dim.push_back(i); + trans_dim->push_back(i); } } @@ -441,19 +444,20 @@ static paddle::Tensor dealWithAdvancedIndex( std::vector original_dim_order(tensor.shape().size()); std::iota(original_dim_order.begin(), original_dim_order.end(), 0); - if (original_dim_order == trans_dim) { + if (original_dim_order == *trans_dim) { transed_tensor = tensor; } else { - transed_tensor = transpose_ad_func(tensor, trans_dim); + *out_is_view = true; + transed_tensor = transpose_ad_func(tensor, *trans_dim); } if (is_for_setitem) { - trans_back_dim->resize(trans_dim.size()); + trans_back_dim->resize(trans_dim->size()); std::iota(trans_back_dim->begin(), trans_back_dim->end(), 0); std::sort(trans_back_dim->begin(), trans_back_dim->end(), [&trans_dim](int left, int right) { - return trans_dim[left] < trans_dim[right]; + return (*trans_dim)[left] < (*trans_dim)[right]; }); } return transed_tensor; @@ -511,5 +515,104 @@ static void ParseBoolAndBroadcastIndices( } } +static paddle::Tensor dealWithValues(const paddle::Tensor& tensor, + PyObject* value_obj, + std::vector* values, + const bool trans_to_tensor) { + paddle::Tensor value_tensor; + if (PyCheckTensor(value_obj)) { + value_tensor = reinterpret_cast(value_obj)->tensor; + } else if (py::isinstance(value_obj)) { + paddle::Tensor value_tensor_tmp( + std::make_shared(), + egr::Controller::Instance().GenerateUniqueName()); + py::object value_obj_tmp(py::handle(value_obj), true); + py::object value = value_obj_tmp; + if (tensor.dtype() == phi::DataType::FLOAT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT32) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::INT64) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::BOOL) { + if (!py::isinstance>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray(value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + if (!py::isinstance>>(value_obj_tmp)) { + value = pybind11::detail::CastNumpyArray>( + value_obj_tmp); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "When assign a numpy.np value to a paddle.Tensor, " + "the data type of the paddle.Tensor must be bool, " + "float32, float64, complex64, complex128, int32 or int64, " + "please check the type of tensor.")); + } + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, + tensor.place(), + false); + value_tensor = value_tensor_tmp; + } else { + py::object value_obj_tmp(py::handle(value_obj), true); + // convert the value to self data type + if (py::isinstance(value_obj_tmp) || 
+ py::isinstance(value_obj_tmp) || + py::isinstance(value_obj_tmp) || + PyComplex_Check(value_obj)) { + if (tensor.dtype() == phi::DataType::FLOAT32 || + tensor.dtype() == phi::DataType::FLOAT16 || + tensor.dtype() == phi::DataType::BFLOAT16) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::FLOAT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT32 || + tensor.dtype() == phi::DataType::INT16 || + tensor.dtype() == phi::DataType::INT8 || + tensor.dtype() == phi::DataType::UINT8) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::INT64) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::BOOL) { + values->push_back(value_obj_tmp.cast()); + } else if (tensor.dtype() == phi::DataType::COMPLEX64) { + values->push_back(value_obj_tmp.cast>()); + } else if (tensor.dtype() == phi::DataType::COMPLEX128) { + values->push_back(value_obj_tmp.cast>()); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Value type error. The assign value allows " + "Tensor, numpy.ndarray, integer, float, complex or bool, " + "but received %s.", + Py_TYPE(value_obj))); + } + + if (trans_to_tensor) { + value_tensor = + full_ad_func({1}, (*values)[0], tensor.dtype(), tensor.place()); + } + } + return value_tensor; +} + } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index cce09cf7fdfd54..44983e3e13df7f 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -117,7 +117,7 @@ limitations under the License. */ #include "paddle/phi/core/lod_utils.h" #include "paddle/utils/none.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif #include "paddle/fluid/framework/data_type.h" @@ -126,11 +126,11 @@ limitations under the License. */ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)|| defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h" #endif #include "paddle/fluid/platform/device/gpu/gpu_info.h" @@ -1101,7 +1101,7 @@ void BindTensor(pybind11::module &m) { // NOLINT .def("height", &phi::SelectedRows::height) .def("set_rows", [](phi::SelectedRows &self, std::vector rows) { -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) self.set_rows(rows); #else std::vector new_rows(rows); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 622d054645eff1..dd5bd7f1d91c4d 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -37,7 +37,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/pybind/complex.h" #include "paddle/phi/kernels/funcs/strided_memcpy.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" @@ -325,7 +325,7 @@ T TensorGetElement(const phi::DenseTensor &self, size_t offset) { #endif } else if (platform::is_gpu_place(self.place()) || platform::is_cuda_pinned_place(self.place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) const T *a = self.data(); auto p = self.place(); paddle::memory::Copy( @@ -362,7 +362,7 @@ void TensorSetElement(phi::DenseTensor *self, size_t offset, T elem) { #endif } else if (platform::is_gpu_place(self->place()) || platform::is_cuda_pinned_place(self->place())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto p = self->place(); T *a = self->mutable_data(p); paddle::memory::Copy( @@ -457,7 +457,7 @@ void SetTensorFromPyArrayT( "Please recompile or reinstall Paddle with CustomDevice support.")); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { // NOTE(wangxi): When copying data to the accelerator card, // we need set_device(dev_id) first. @@ -466,9 +466,6 @@ void SetTensorFromPyArrayT( #ifdef PADDLE_WITH_HIP paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), hipMemcpyHostToDevice); -#elif defined(PADDLE_WITH_MUSA) - paddle::platform::GpuMemcpySync( - dst, array.data(), array.nbytes(), musaMemcpyHostToDevice); #else paddle::platform::GpuMemcpySync( dst, array.data(), array.nbytes(), cudaMemcpyHostToDevice); @@ -793,7 +790,7 @@ inline phi::DenseTensor *_getTensor(const phi::DenseTensor &self, output->mutable_data(place, self.dtype()); #endif } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_cuda_pinned_place(place)) { output->mutable_data(place, self.dtype()); } else if ((platform::is_gpu_place(place))) { @@ -1050,13 +1047,11 @@ inline py::array TensorToPyArray(const phi::DenseTensor &tensor, "Please recompile or reinstall Paddle with XPU support.")); #endif } else if (is_gpu_tensor) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) gpuMemcpyKind kind = cudaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) gpuMemcpyKind kind = hipMemcpyDeviceToHost; -#elif defined(PADDLE_WITH_MUSA) - gpuMemcpyKind kind = musaMemcpyDeviceToHost; #endif phi::DenseTensor cpu_tensor; platform::CPUPlace cpu_place; diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index 8636de26c4161e..09b4337ecb40b3 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -51,13 +51,6 @@ if(WITH_GPU) list(APPEND PHI_DEPS external_error_proto) endif() -if(WITH_MUSA) - set(DEPENDENT_LIBRARIES "") - list(APPEND DEPENDENT_LIBRARIES "/usr/local/musa/lib/libmudnn.so") - list(APPEND PHI_DEPS 
${DEPENDENT_LIBRARIES}) -endif() - - if(WITH_ASCEND_CL) list(APPEND PHI_DEPS npu_hccl) endif() @@ -141,11 +134,11 @@ if(WITH_GPU) SRCS ${PHI_SRCS} DEPS ${PHI_DEPS}) elseif(WITH_ROCM) - hip_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) - target_link_libraries(phi ${PHI_DEPS}) -elseif(WITH_MUSA) - musa_add_library(phi ${PHI_BUILD_TYPE} ${PHI_SRCS}) - target_link_libraries(phi ${PHI_DEPS}) + hip_library( + phi ${PHI_BUILD_TYPE} + SRCS ${PHI_SRCS} + DEPS ${PHI_DEPS}) + elseif(WITH_XPU_KP) xpu_library( phi ${PHI_BUILD_TYPE} diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index a6f8b3949c20a4..86ba7b9cf75764 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -99,7 +99,7 @@ namespace paddle { */ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * Get the current CUDA stream for the passed CUDA device. */ diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index 3ef838410bed07..a6e78686e1e4ce 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -24,11 +24,6 @@ limitations under the License. */ using gpuStream_t = cudaStream_t; #endif -#ifdef PADDLE_WITH_MUSA -#include -using gpuStream_t = musaStream_t; -#endif - #ifdef PADDLE_WITH_HIP #include using gpuStream_t = hipStream_t; @@ -418,7 +413,7 @@ class PADDLE_API Tensor final { */ void set_impl(std::shared_ptr&& impl); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief Get the stream where the tensor is currently located * This is a deprecated method and may be removed in the future! diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index 2d5d1a49f02e77..ed64ff1c937b64 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -385,7 +385,7 @@ void TransStride(phi::DeviceContext* dev_ctx, delete from; return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] { @@ -437,7 +437,7 @@ void TransStrideLegacy(phi::DeviceContext* dev_ctx, })); return; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to->dtype(), "StridedCopyKernel", ([&] { @@ -489,7 +489,7 @@ void TransStride(phi::DeviceContext* dev_ctx, delete from[i]; continue; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto* gpu_ctx = dynamic_cast(dev_ctx); if (gpu_ctx) { PD_VISIT_ALL_TYPES(to[i]->dtype(), "StridedCopyKernel", ([&] { diff --git a/paddle/phi/api/lib/context_pool.cc b/paddle/phi/api/lib/context_pool.cc index b2c3f9f28ee79c..ee1e21a58e2f1b 100644 --- a/paddle/phi/api/lib/context_pool.cc +++ b/paddle/phi/api/lib/context_pool.cc @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/allocator.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/core/cuda_stream.h" #endif @@ -75,7 +75,7 @@ PADDLE_API phi::Allocator* GetAllocator(const phi::Place& place) { return const_cast(&dev_ctx->GetAllocator()); // NOLINT } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_API phi::CUDAStream* GetCurrentCUDAStream(const phi::Place& place) { PADDLE_ENFORCE_EQ(place.GetType(), phi::AllocationType::GPU, diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 2ea7ae4f5e3d84..03ac68d3319915 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -116,7 +116,7 @@ phi::DenseTensor CastDataType(const Context& dev_ctx, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { @@ -158,7 +158,7 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return CastDataType(*dev_ctx, tensor, dtype); @@ -196,7 +196,7 @@ inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, << " dst_place: " << dst_place; auto& pool = phi::DeviceContextPool::Instance(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // NOTE(yy): TransDataPlace should wait for computation of input. 
if (tensor.place().GetType() != phi::AllocationType::GPUPINNED) { pool.Get(tensor.place())->Wait(); @@ -247,7 +247,7 @@ phi::DenseTensor Trans2Contiguous(const phi::DenseTensor& tensor) { if (tensor.place().GetType() == phi::AllocationType::CPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return TensorContiguous(*dev_ctx, tensor); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (tensor.place().GetType() == phi::AllocationType::GPU) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); return TensorContiguous(*dev_ctx, tensor); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index ee88e9fb1b0c88..49c47cbcce363c 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -376,7 +376,7 @@ void Tensor::set_impl(std::shared_ptr &&impl) { impl_ = std::move(impl); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t Tensor::stream() const { int device_id = phi::backends::gpu::GetCurrentDeviceId(); auto *gpu_context = DeviceContextPool::Instance().Get( diff --git a/paddle/phi/api/lib/tensor_utils.cc b/paddle/phi/api/lib/tensor_utils.cc index 0aad2a6da5fdce..9c11e88260c1df 100644 --- a/paddle/phi/api/lib/tensor_utils.cc +++ b/paddle/phi/api/lib/tensor_utils.cc @@ -20,11 +20,11 @@ limitations under the License. */ #include "paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include #else -#include +#include #endif #endif @@ -33,26 +33,26 @@ namespace paddle { PD_REGISTER_API(from_blob) phi::Place GetPlaceFromPtr(void* data) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -// #ifdef PADDLE_WITH_CUDA -// #if CUDA_VERSION >= 10000 - musaPointerAttributes attr; - musaError_t status = musaPointerGetAttributes(&attr, data); - if (status == musaSuccess && attr.type == musaMemoryTypeDevice) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + cudaPointerAttributes attr; + cudaError_t status = cudaPointerGetAttributes(&attr, data); + if (status == cudaSuccess && attr.type == cudaMemoryTypeDevice) { return phi::GPUPlace(attr.device); } -// #else -// PADDLE_THROW( -// phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " -// "supported when CUDA version >= 10.0.")); -// #endif -// #else -// hipPointerAttribute_t attr; -// hipError_t status = hipPointerGetAttributes(&attr, data); -// if (status == hipSuccess && attr.memoryType == hipMemoryTypeDevice) { -// return phi::GPUPlace(attr.device); -// } -// #endif +#else + PADDLE_THROW( + phi::errors::Unimplemented("The GetPlaceFromPtr() method is only " + "supported when CUDA version >= 10.0.")); +#endif +#else + hipPointerAttribute_t attr; + hipError_t status = hipPointerGetAttributes(&attr, data); + if (status == hipSuccess && attr.memoryType == hipMemoryTypeDevice) { + return phi::GPUPlace(attr.device); + } +#endif #endif return phi::CPUPlace(); } diff --git a/paddle/phi/api/profiler/event.h b/paddle/phi/api/profiler/event.h index 70fb4d948986c4..eb765ebdcb9dd3 100644 --- a/paddle/phi/api/profiler/event.h +++ b/paddle/phi/api/profiler/event.h @@ -27,10 
+27,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_HIP #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/core/cuda_stream.h" #endif @@ -64,7 +62,7 @@ class Event { void set_name(std::string name) { name_ = name; } void set_role(EventRole role) { role_ = role; } std::string attr() const { return attr_; } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifndef PADDLE_WITH_CUPTI gpuEvent_t event() const { return event_; } int device() const { return device_; } @@ -83,7 +81,7 @@ class Event { uint64_t cpu_ns_; bool visited_status_{false}; std::string attr_; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUPTI int64_t gpu_ns_ = 0; @@ -139,14 +137,12 @@ class MemEvent { }; class CudaEvent { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) public: CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); -#elif defined(PADDLE_WITH_MUSA) - musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -156,8 +152,6 @@ class CudaEvent { explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); -#elif defined(PADDLE_WITH_MUSA) - musaEventCreateWithFlags(&event_, flags_); #else cudaEventCreateWithFlags(&event_, flags_); #endif @@ -167,8 +161,6 @@ class CudaEvent { ~CudaEvent() { #ifdef PADDLE_WITH_HIP hipEventDestroy(event_); -#elif defined(PADDLE_WITH_MUSA) - musaEventDestroy(event_); #else cudaEventDestroy(event_); #endif @@ -177,8 +169,6 @@ class CudaEvent { void Record(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(event_, stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, stream)); #endif @@ -193,14 +183,6 @@ class CudaEvent { if (err == hipErrorNotReady) { return false; } -#elif defined(PADDLE_WITH_MUSA) - gpuError_t err = musaEventQuery(event_); - if (err == musaSuccess) { - return true; - } - if (err == musaErrorNotReady) { - return false; - } #else gpuError_t err = cudaEventQuery(event_); if (err == cudaSuccess) { @@ -217,8 +199,6 @@ class CudaEvent { void Synchronize() { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventSynchronize(event_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventSynchronize(event_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); #endif @@ -228,8 +208,6 @@ class CudaEvent { private: #ifdef PADDLE_WITH_HIP unsigned int flags_ = hipEventDefault; -#elif defined(PADDLE_WITH_MUSA) - unsigned int flags_ = musaEventDefault; #else unsigned int flags_ = cudaEventDefault; #endif diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 3a87826337465b..81339a24c50de8 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -944,8 +944,6 @@ func : gather_nd_grad composite : gather_nd_grad(x, index, out_grad, x_grad) no_need_buffer : x - data_transform : - skip_transform : index - backward_op : 
gaussian_inplace_grad forward : gaussian_inplace(Tensor x, float mean=0, float std=1.0, int seed=0) -> Tensor(out) @@ -1762,8 +1760,8 @@ optional : boxes_num - backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) + forward : put_along_axis (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor values, Tensor out, Tensor out_grad, int axis, str reduce, bool include_self) output : Tensor(arr_grad), Tensor(values_grad) infer_meta : func : GeneralBinaryGradInferMeta diff --git a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py index 3769155eb27e11..c7ec9ace290ac7 100644 --- a/paddle/phi/api/yaml/generator/dist_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/dist_bw_api_gen.py @@ -425,6 +425,7 @@ def source_include(header_file_path, fw_header_file_path): #include "{fw_header_file_path}" #include "paddle/phi/infermeta/backward.h" #include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/fusion.h" #include "paddle/phi/api/profiler/event_tracing.h" #include "paddle/phi/api/profiler/supplement_tracing.h" diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 04cf57a88bb7cb..3f11781dfe88eb 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -614,14 +614,14 @@ - backward_op : set_value_grad forward : set_value (Tensor x, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes, int64_t[] shape, Scalar[] values) -> Tensor(out) - args : (Tensor out_grad) + args : (Tensor out_grad, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) output : Tensor(x_grad) infer_meta: func: UnchangedInferMeta param: [out_grad] kernel: - func: assign - param: [out_grad] + func: set_value_with_scalar_grad + param: [out_grad, starts, ends, steps, axes, decrease_axes, none_axes] - backward_op : set_value_with_tensor_grad forward: set_value_with_tensor (Tensor x, Tensor values, IntArray starts, IntArray ends, IntArray steps, int64_t[] axes, int64_t[] decrease_axes, int64_t[] none_axes) -> Tensor(out) diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e4bbb15073f418..dfcdf65673e208 100755 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -2432,7 +2432,7 @@ outputs : out : Result attrs : - {axis : Axis, reduce : Reduce} + {axis : Axis, reduce : Reduce, include_self: Include_self} - op : pylayer backward : pylayer_grad diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 092b3d71a60b4d..efc1b17714a854 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -2032,7 +2032,7 @@ backward : psroi_pool_grad - op : put_along_axis - args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign", bool include_self = true) output : Tensor(out) infer_meta : func : UnchangedInferMeta diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index db0d463bc67156..ed47487553bee7 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ 
b/paddle/phi/backends/CMakeLists.txt @@ -7,7 +7,7 @@ if(NOT APPLE AND NOT WIN32) list(APPEND BACKENDS_SRCS device_code.cc) endif() -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc gpu/gpu_resources.cc) if(WITH_GPU) @@ -16,9 +16,6 @@ if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) if(WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/rocm/rocm_info.cc) endif() - if(WITH_MUSA) - list(APPEND BACKENDS_SRCS gpu/musa/musa_info.cc) - endif() endif() if(WITH_XPU) @@ -52,7 +49,6 @@ list( if(WITH_GPU OR WITH_ROCM - OR WITH_MUSA OR WITH_CUSTOM_DEVICE) list(APPEND BACKENDS_SRCS device_base.cc) endif() diff --git a/paddle/phi/backends/context_pool.cc b/paddle/phi/backends/context_pool.cc index 9e8ecd48e453c5..7824fc3b160b10 100644 --- a/paddle/phi/backends/context_pool.cc +++ b/paddle/phi/backends/context_pool.cc @@ -21,7 +21,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) bool allow_tf32_cublas = true; void SetAllowTF32Cublas(bool active) { allow_tf32_cublas = active; } bool AllowTF32Cublas() { return allow_tf32_cublas; } diff --git a/paddle/phi/backends/context_pool.h b/paddle/phi/backends/context_pool.h index a0537c779e52f7..52f0ced275ac5e 100644 --- a/paddle/phi/backends/context_pool.h +++ b/paddle/phi/backends/context_pool.h @@ -28,7 +28,7 @@ limitations under the License. */ namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void SetAllowTF32Cublas(bool active); /*Get the global variable allow_tf32_cublas value*/ bool AllowTF32Cublas(); @@ -47,7 +47,7 @@ struct DefaultDeviceContextType { using TYPE = phi::CPUContext; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> struct DefaultDeviceContextType { using TYPE = phi::GPUContext; diff --git a/paddle/phi/backends/custom/custom_device.cc b/paddle/phi/backends/custom/custom_device.cc index 48bedd1bd939e4..ddbfc60f19f083 100644 --- a/paddle/phi/backends/custom/custom_device.cc +++ b/paddle/phi/backends/custom/custom_device.cc @@ -383,7 +383,7 @@ class CustomDevice : public DeviceInterface { void* ptr = nullptr; const auto device = &devices_pool[dev_id]; - if (!pimpl_->unified_memory_allocate) { + if (!pimpl_->host_memory_allocate) { PADDLE_THROW(phi::errors::Unavailable( "MemoryAllocateHost is not supported on %s.", Type())); } else { diff --git a/paddle/phi/backends/device_code.cc b/paddle/phi/backends/device_code.cc index ac16a69aa7bee7..d160b5034f9986 100644 --- a/paddle/phi/backends/device_code.cc +++ b/paddle/phi/backends/device_code.cc @@ -78,8 +78,7 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } for (auto& p : set) { if (p.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) device_codes_.emplace(p, DeviceCodeMap()); #else PADDLE_THROW(phi::errors::PreconditionNotMet( @@ -89,14 +88,12 @@ DeviceCodePool::DeviceCodePool(const std::vector& places) { } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GPUDeviceCode::CheckAvailableStatus(); #endif } -#if 
defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP static bool CheckCUDADriverResult(hipError_t result, std::string caller, @@ -104,13 +101,6 @@ static bool CheckCUDADriverResult(hipError_t result, if (result != hipSuccess) { const char* error = nullptr; error = dynload::hipGetErrorString(result); -#elif defined(PADDLE_WITH_MUSA) -static bool CheckCUDADriverResult(MUresult result, - std::string caller, - std::string kernel_name = "") { - if (result != MUSA_SUCCESS) { - const char* error = nullptr; - dynload::muGetErrorString(result, &error); #else static bool CheckCUDADriverResult(CUresult result, std::string caller, @@ -140,8 +130,6 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hiprtcResult nvrtc_result = dynload::hiprtcVersion(&nvrtc_major, &nvrtc_minor); -#elif defined(PADDLE_WITH_MUSA) - mtrtcResult nvrtc_result = dynload::mtrtcVersion(&nvrtc_major, &nvrtc_minor); #else nvrtcResult nvrtc_result = dynload::nvrtcVersion(&nvrtc_major, &nvrtc_minor); #endif @@ -152,9 +140,6 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP hipError_t driver_result = dynload::hipDriverGetVersion(&driver_version); if (driver_result == hipSuccess) { -#elif defined(PADDLE_WITH_MUSA) - MUresult driver_result = dynload::muDriverGetVersion(&driver_version); - if (driver_result == MUSA_SUCCESS) { #else CUresult driver_result = dynload::cuDriverGetVersion(&driver_version); if (driver_result == CUDA_SUCCESS) { @@ -168,8 +153,6 @@ void GPUDeviceCode::CheckAvailableStatus() { << "." << nvrtc_minor; #ifdef PADDLE_WITH_HIP if (nvrtc_result != HIPRTC_SUCCESS || driver_result != hipSuccess) { -#elif defined(PADDLE_WITH_MUSA) - if (nvrtc_result != MTRTC_SUCCESS || driver_result != MUSA_SUCCESS) { #else if (nvrtc_result != NVRTC_SUCCESS || driver_result != CUDA_SUCCESS) { #endif @@ -180,9 +163,6 @@ void GPUDeviceCode::CheckAvailableStatus() { #ifdef PADDLE_WITH_HIP if (CheckCUDADriverResult(dynload::hipGetDeviceCount(&count), "hipGetDeviceCount")) { -#elif defined(PADDLE_WITH_MUSA) - if (CheckCUDADriverResult(dynload::muDeviceGetCount(&count), - "muDeviceGetCount")) { #else if (CheckCUDADriverResult(dynload::cuDeviceGetCount(&count), "cuDeviceGetCount")) { @@ -222,8 +202,6 @@ static std::string FindCUDAIncludePath() { #ifdef PADDLE_WITH_HIP cuda_include_path = "/opt/rocm/include"; -#elif defined(PADDLE_WITH_MUSA) - cuda_include_path = "/usr/local/musa/include"; #else cuda_include_path = "/usr/local/cuda/include"; #endif @@ -251,8 +229,6 @@ GPUDeviceCode::GPUDeviceCode(const Place& place, name_ = name; #ifdef PADDLE_WITH_HIP kernel_ = "#include \n" + kernel; -#elif defined(PADDLE_WITH_MUSA) - kernel_ = kernel; #else kernel_ = kernel; #endif @@ -281,12 +257,12 @@ bool GPUDeviceCode::Compile(bool include_path) { auto* dev_ctx = reinterpret_cast( DeviceContextPool::Instance().Get(place_)); int compute_capability = dev_ctx->GetComputeCapability(); - std::vector options = {"-std=c++11", "--amdgpu-target=gfx906"}; + std::vector options = {"-std=c++11"}; std::string include_option; if (include_path) { std::string cuda_include_path = FindCUDAIncludePath(); if (!cuda_include_path.empty()) { - include_option = "--include-path=" + cuda_include_path; + include_option = "-I" + cuda_include_path; options.push_back(include_option.c_str()); } } @@ -342,86 +318,6 @@ bool GPUDeviceCode::Compile(bool include_path) { "hipModuleGetFunction")) { return false; } -#elif 
defined(PADDLE_WITH_MUSA) - mtrtcProgram program; - if (!CheckNVRTCResult(dynload::mtrtcCreateProgram(&program, - kernel_.c_str(), // buffer - name_.c_str(), // name - 0, // numHeaders - nullptr, // headers - nullptr), // includeNames - "mtrtcCreateProgram")) { - return false; - } - - // Compile the program for specified compute_capability - auto* dev_ctx = reinterpret_cast( - DeviceContextPool::Instance().Get(place_)); - int compute_capability = dev_ctx->GetComputeCapability(); - std::string compute_flag = - "--gpu-architecture=compute_" + std::to_string(compute_capability); - std::vector options = {"--std=c++11", compute_flag.c_str()}; - std::string include_option; - if (include_path) { - std::string cuda_include_path = FindCUDAIncludePath(); - if (!cuda_include_path.empty()) { - include_option = "--include-path=" + cuda_include_path; - options.push_back(include_option.c_str()); - } - } - mtrtcResult compile_result = - dynload::mtrtcCompileProgram(program, // program - options.size(), // numOptions - options.data()); // options - if (compile_result == MTRTC_ERROR_COMPILATION) { - // Obtain compilation log from the program - size_t log_size; - if (!CheckNVRTCResult(dynload::mtrtcGetProgramLogSize(program, &log_size), - "mtrtcGetProgramLogSize")) { - return false; - } - std::vector log; - log.resize(log_size + 1); - if (!CheckNVRTCResult(dynload::mtrtcGetProgramLog(program, log.data()), - "nvrtcGetProgramLog")) { - return false; - } - LOG(WARNING) << "JIT compiling of MUSA code failed:" - << "\n Kernel name: " << name_ << "\n Kernel body:\n" - << kernel_ << "\n Compiling log: " << log.data(); - - return false; - } - - // Obtain PTX from the program - size_t ptx_size; - if (!CheckNVRTCResult(dynload::mtrtcGetMUSASize(program, &ptx_size), - "mtrtcGetMUSASize")) { - return false; - } - ptx_.resize(ptx_size + 1); - if (!CheckNVRTCResult(dynload::mtrtcGetMUSA(program, ptx_.data()), - "mtrtcGetMUSA")) { - return false; - } - - if (!CheckNVRTCResult(dynload::mtrtcDestroyProgram(&program), - "mtrtcDestroyProgram")) { - return false; - } - - if (!CheckCUDADriverResult(dynload::muModuleLoadData(&module_, ptx_.data()), - "muModuleLoadData", - name_)) { - return false; - } - - if (!CheckCUDADriverResult( - dynload::muModuleGetFunction(&function_, module_, name_.c_str()), - "muModuleGetFunction", - name_)) { - return false; - } #else nvrtcProgram program; if (!CheckNVRTCResult(dynload::nvrtcCreateProgram(&program, @@ -540,22 +436,6 @@ void GPUDeviceCode::Launch(const size_t n, std::vector* args) const { hipSuccess, errors::External("Fail to launch kernel %s (in hipModuleLaunchKernel.)", name_.c_str())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_EQ( - dynload::muLaunchKernel(function_, - num_blocks, - 1, - 1, // grid dim - num_threads_, - 1, - 1, // block dim - 0, // shared memory - dev_ctx->stream(), // stream - args->data(), // arguments - nullptr), - MUSA_SUCCESS, - errors::External("Fail to launch kernel %s (in muLaunchKernel.)", - name_.c_str())); #else PADDLE_ENFORCE_EQ( dynload::cuLaunchKernel(function_, @@ -584,18 +464,6 @@ bool GPUDeviceCode::CheckNVRTCResult(hiprtcResult result, << " > failed: " << dynload::hiprtcGetErrorString(result); return false; } - return true; -} -#elif defined(PADDLE_WITH_MUSA) -bool GPUDeviceCode::CheckNVRTCResult(mtrtcResult result, std::string function) { - if (result != MTRTC_SUCCESS) { - LOG_FIRST_N(WARNING, 1) - << "Call " << function << " for < " << name_ - << " > failed: " << dynload::mtrtcGetErrorString(result); - return false; - } - return true; -} 
#else bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { if (result != NVRTC_SUCCESS) { @@ -604,9 +472,9 @@ bool GPUDeviceCode::CheckNVRTCResult(nvrtcResult result, std::string function) { << " > failed: " << dynload::nvrtcGetErrorString(result); return false; } +#endif return true; } #endif -#endif } // namespace phi diff --git a/paddle/phi/backends/device_code.h b/paddle/phi/backends/device_code.h index 964124076e6057..8debb4dc9c45ee 100644 --- a/paddle/phi/backends/device_code.h +++ b/paddle/phi/backends/device_code.h @@ -26,20 +26,11 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/cuda_driver.h" #include "paddle/phi/backends/dynload/nvrtc.h" #endif -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/musa_driver.h" -#include "paddle/phi/backends/dynload/musartc.h" -#endif #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hiprtc.h" #include "paddle/phi/backends/dynload/rocm_driver.h" #endif -#ifdef PADDLE_WITH_MUSA -// #include "paddle/phi/backends/dynload/hiprtc.h" -// #include "paddle/phi/backends/dynload/rocm_driver.h" -#endif - namespace phi { class DeviceCode { @@ -57,7 +48,7 @@ class DeviceCode { std::string kernel_; }; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUDeviceCode : public DeviceCode { public: explicit GPUDeviceCode(const Place& place, @@ -77,8 +68,6 @@ class GPUDeviceCode : public DeviceCode { private: #ifdef PADDLE_WITH_HIP bool CheckNVRTCResult(hiprtcResult result, std::string function); -#elif defined(PADDLE_WITH_MUSA) - bool CheckNVRTCResult(mtrtcResult result, std::string function); #else bool CheckNVRTCResult(nvrtcResult result, std::string function); #endif @@ -93,9 +82,6 @@ class GPUDeviceCode : public DeviceCode { #ifdef PADDLE_WITH_HIP hipModule_t module_; hipFunction_t function_; -#elif defined(PADDLE_WITH_MUSA) - MUmodule module_; - MUfunction function_; #else CUmodule module_; CUfunction function_; diff --git a/paddle/phi/backends/device_memory_aligment.h b/paddle/phi/backends/device_memory_aligment.h index d731b6b6d1ecf9..c65e06364acd0e 100644 --- a/paddle/phi/backends/device_memory_aligment.h +++ b/paddle/phi/backends/device_memory_aligment.h @@ -36,7 +36,7 @@ inline size_t Alignment(size_t size, if (place.GetType() == phi::AllocationType::CPU) { alignment = phi::backends::cpu::CpuMinChunkSize(); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) alignment = phi::backends::gpu::GpuMinChunkSize(); #elif defined(PADDLE_WITH_XPU) alignment = phi::backends::xpu::XPUMinChunkSize(); diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index 2ea6f11aa53a65..2db75d7022f0a5 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -30,17 +30,6 @@ if(WITH_ROCM) rocsparse.cc) endif() -if(WITH_MUSA) - list( - APPEND - MUSA_SRCS - mublas.cc - mudnn.cc - murand.cc - mufft.cc - musparse.cc) -endif() - # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on macOS, and only do an early test on Linux and Windows. 
if(NOT APPLE) @@ -57,15 +46,6 @@ if(NOT APPLE) list(APPEND HIP_SRCS cupti.cc) endif() endif() - if(WITH_MUSA) - list(APPEND MUSA_SRCS musartc.cc musa_driver.cc) - if(WITH_MCCL) - list(APPEND MUSA_SRCS mccl.cc) - endif() - if(CUPTI_FOUND) - list(APPEND MUSA_SRCS cupti.cc) - endif() - endif() endif() if(TENSORRT_FOUND) @@ -113,8 +93,6 @@ if(WITH_ROCM) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${HIP_SRCS}) elseif(WITH_GPU) collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${CUDA_SRCS}) -elseif(WITH_MUSA) - collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS} ${MUSA_SRCS}) else() collect_srcs(backends_srcs SRCS ${DYNLOAD_COMMON_SRCS}) endif() diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 987f0eefc4397f..bdb9e120d2884b 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -102,29 +102,6 @@ PHI_DEFINE_string(rccl_dir, "dlopen will search rccl from LD_LIBRARY_PATH"); #endif -#ifdef PADDLE_WITH_MUSA - -PHI_DEFINE_string(mudnn_dir, - "", - "Specify path for loading libmudnn.so. For instance, " - "/usr/local/musa/lib. If empty [default], dlopen " - "will search mudnn from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(musa_dir, - "", - "Specify path for loading rocm library, such as libmublas, " - "For instance, /usr/local/musa/lib. " - "If default, dlopen will search rocm from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(mccl_dir, - "", - "Specify path for loading mccl library, such as libmccl.so. " - "For instance, /usr/local/musa/lib. If default, " - "dlopen will search rccl from LD_LIBRARY_PATH"); -#endif - - - #ifdef PADDLE_WITH_XPU PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); #endif @@ -349,8 +326,6 @@ void* GetCublasDsoHandle() { FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmublas.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -392,9 +367,6 @@ void* GetCUDNNDsoHandle() { FLAGS_cudnn_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libmudnn.so", false, {cuda_lib_path}); #else return GetDsoHandleFromSearchPath( FLAGS_cudnn_dir, "libcudnn.so", false, {cuda_lib_path}); @@ -419,8 +391,6 @@ void* GetCurandDsoHandle() { FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmurand.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -436,12 +406,6 @@ void* GetROCFFTDsoHandle() { } #endif -#ifdef PADDLE_WITH_MUSA -void* GetMUFFTDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmufft.so"); -} -#endif - void* GetNvjpegDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); @@ -472,8 +436,6 @@ void* GetCusparseDsoHandle() { FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so"); -#elif 
defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusparse.so"); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so"); #endif @@ -484,8 +446,6 @@ void* GetNVRTCDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusart.so", false); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so", false); #endif @@ -496,8 +456,6 @@ void* GetCUDADsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); #elif defined(PADDLE_WITH_HIP) return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#elif defined(PADDLE_WITH_MUSA) - return GetDsoHandleFromSearchPath(FLAGS_musa_dir, "libmusa.so", false); #elif defined(_WIN32) char system32_dir[MAX_PATH]; GetSystemDirectory(system32_dir, MAX_PATH); @@ -555,9 +513,6 @@ void* GetNCCLDsoHandle() { "You may need to install 'rccl' from ROCM official website: " "https://rocmdocs.amd.com/en/latest/Installation_Guide/" "Installation-Guide.html before install PaddlePaddle."); -#elif defined(PADDLE_WITH_MUSA) - std::string warning_msg( - "You may need to install 'mccl' from musa official website."); #else std::string warning_msg( "You may need to install 'nccl2' from NVIDIA official website: " @@ -571,9 +526,6 @@ void* GetNCCLDsoHandle() { #elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) return GetDsoHandleFromSearchPath( FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); -#elif defined(PADDLE_WITH_MUSA) && defined(PADDLE_WITH_MCCL) - return GetDsoHandleFromSearchPath( - FLAGS_mccl_dir, "libmccl.so", true, {}, warning_msg); #else return GetDsoHandleFromSearchPath( FLAGS_nccl_dir, "libnccl.so", true, {}, warning_msg); diff --git a/paddle/phi/backends/dynload/dynamic_loader.h b/paddle/phi/backends/dynload/dynamic_loader.h index 02da303b2020f9..6ddeb1386410f0 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.h +++ b/paddle/phi/backends/dynload/dynamic_loader.h @@ -48,7 +48,6 @@ void* GetMKLRTDsoHandle(); void* GetROCFFTDsoHandle(); void* GetCusparseLtDsoHandle(); void* GetXPTIDsoHandle(); -void* GetMUFFTDsoHandle(); void SetPaddleLibPath(const std::string&); diff --git a/paddle/phi/backends/dynload/mccl.cc b/paddle/phi/backends/dynload/mccl.cc deleted file mode 100644 index 3bf5fd8c985d12..00000000000000 --- a/paddle/phi/backends/dynload/mccl.cc +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/backends/dynload/mccl.h" - -namespace phi { -namespace dynload { - -std::once_flag mccl_dso_flag; -void *mccl_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); - -MCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) - -MCCL_RAND_ROUTINE_EACH_AFTER_2304(DEFINE_WRAP) - -MCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) - -MCCL_RAND_ROUTINE_EACH_AFTER_21100(DEFINE_WRAP) - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mccl.h b/paddle/phi/backends/dynload/mccl.h deleted file mode 100644 index 4e2eaeea00afa3..00000000000000 --- a/paddle/phi/backends/dynload/mccl.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag mccl_dso_flag; -extern void* mccl_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_MCCL_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using nccl_func = decltype(&::__name); \ - std::call_once(mccl_dso_flag, []() { \ - mccl_dso_handle = phi::dynload::GetNCCLDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(mccl_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define MCCL_RAND_ROUTINE_EACH(__macro) \ - __macro(mcclCommInitAll); \ - __macro(mcclGetUniqueId); \ - __macro(mcclCommInitRank); \ - __macro(mcclCommAbort); \ - __macro(mcclCommDestroy); \ - __macro(mcclCommCount); \ - __macro(mcclCommCuDevice); \ - __macro(mcclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclGroupStart); \ - __macro(mcclAllGather); \ - __macro(mcclGroupEnd); \ - __macro(mcclReduce); \ - __macro(mcclReduceScatter); \ - __macro(mcclCommGetAsyncError); \ - __macro(mcclGetErrorString); - -MCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(mcclBroadcast); -MCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); -MCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ - __macro(mcclSend); \ - __macro(mcclRecv); -MCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) - -#define MCCL_RAND_ROUTINE_EACH_AFTER_21100(__macro) \ - __macro(mcclRedOpCreatePreMulSum); \ - __macro(mcclRedOpDestroy); -MCCL_RAND_ROUTINE_EACH_AFTER_21100(DECLARE_DYNAMIC_LOAD_MCCL_WRAP) -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.cc b/paddle/phi/backends/dynload/mublas.cc deleted file mode 100644 index fd05d45414b47e..00000000000000 --- a/paddle/phi/backends/dynload/mublas.cc +++ 
/dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/mublas.h" - -namespace phi { -namespace dynload { -std::once_flag mublas_dso_flag; -void *mublas_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R2 -MUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R3 -MUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); -#endif - -#ifdef MUBLAS_BLAS_ROUTINE_EACH_R4 -MUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); -#endif -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mublas.h b/paddle/phi/backends/dynload/mublas.h deleted file mode 100644 index 9f8db31bd2d060..00000000000000 --- a/paddle/phi/backends/dynload/mublas.h +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - - -#include -#include - -#include // NOLINT -#include - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag mublas_dso_flag; -extern void *mublas_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load mublas routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using blas_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(mublas_dso_flag, []() { \ - mublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(mublas_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define MUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(mublasSaxpy); \ - __macro(mublasDaxpy); \ - __macro(mublasCaxpy); \ - __macro(mublasZaxpy); \ - __macro(mublasSscal); \ - __macro(mublasDscal); \ - __macro(mublasScopy); \ - __macro(mublasDcopy); \ - __macro(mublasSgemv); \ - __macro(mublasDgemv); \ - __macro(mublasCgemv); \ - __macro(mublasZgemv); \ - __macro(mublasSgemm); \ - __macro(mublasDgemm); \ - __macro(mublasCgemm); \ - __macro(mublasZgemm); \ - __macro(mublasSgeam); \ - __macro(mublasDgeam); \ - __macro(mublasStrsm); \ - __macro(mublasDtrsm); \ - __macro(mublasCtrsm); \ - __macro(mublasZtrsm); \ - __macro(mublasCreate); \ - __macro(mublasDestroy); \ - __macro(mublasSetStream); \ - __macro(mublasSetPointerMode); \ - __macro(mublasGetPointerMode); \ - __macro(mublasSgemmBatched); \ - __macro(mublasDgemmBatched); \ - __macro(mublasCgemmBatched); \ - __macro(mublasZgemmBatched); \ - __macro(mublasStrsmBatched); \ - __macro(mublasDtrsmBatched); \ - __macro(mublasCtrsmBatched); \ - __macro(mublasZtrsmBatched); - // __macro(mublasHgemm); - //__macro(mublasSgemmEx); - //__macro(mublasSgetrfBatched); - //__macro(mublasSgetriBatched); - //__macro(mublasDgetrfBatched); - //__macro(mublasDgetriBatched); - //__macro(mublasSmatinvBatched); - //__macro(mublasDmatinvBatched); - //__macro(mublasSgetrsBatched); -// __macro(mublasDgetrsBatched); - -MUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#define MUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(mublasGemmEx); \ - __macro(mublasSgemmStridedBatched); \ - __macro(mublasDgemmStridedBatched); \ - __macro(mublasCgemmStridedBatched); \ - __macro(mublasZgemmStridedBatched); \ - __macro(mublasHgemmStridedBatched); - -MUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#define MUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ - __macro(mublasSetMathMode); \ - __macro(mublasGetMathMode); - -MUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#define MUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ - __macro(mublasGemmBatchedEx); - // __macro(mublasGemmStridedBatchedEx); - -MUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP) - -#undef DECLARE_DYNAMIC_LOAD_MUBLAS_WRAP -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mudnn.cc b/paddle/phi/backends/dynload/mudnn.cc deleted file mode 100644 index cd193688bc347d..00000000000000 --- a/paddle/phi/backends/dynload/mudnn.cc +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mudnn.h" - -namespace phi { -namespace dynload { - -bool HasCUDNN() { - // note: mudnn.so is not imported by dlopen, which will be linked - // in cmakelist.txt. - return true; -} - -void mudnnCreate(Handle** handle, int device) { *handle = new Handle(device); } - -void mudnnSetStream(Handle* handle, musaStream_t stream) { - handle->SetStream(stream); -} - -void mudnnDestroy(Handle* handle) { - if (handle != nullptr) { - delete handle; - handle = nullptr; - } -} - -} // namespace dynload -} // namespace phi -#endif diff --git a/paddle/phi/backends/dynload/mudnn.h b/paddle/phi/backends/dynload/mudnn.h deleted file mode 100644 index d05f32a8b5df05..00000000000000 --- a/paddle/phi/backends/dynload/mudnn.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include - -namespace phi { -namespace dynload { - -using ::musa::dnn::BatchNorm; -using ::musa::dnn::Convolution; -using ::musa::dnn::Handle; -using ::musa::dnn::MemoryHandler; -using ::musa::dnn::Pooling; -using ::musa::dnn::Softmax; -using ::musa::dnn::Tensor; - -extern bool HasCUDNN(); - -void mudnnCreate(Handle** handle, int device); - -void mudnnSetStream(Handle* handle, musaStream_t stream); - -void mudnnDestroy(Handle* handle); - -} // namespace dynload -} // namespace phi -#endif diff --git a/paddle/phi/backends/dynload/mufft.cc b/paddle/phi/backends/dynload/mufft.cc deleted file mode 100644 index 9e30463ea39fa1..00000000000000 --- a/paddle/phi/backends/dynload/mufft.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/mufft.h" - -#include "paddle/phi/core/enforce.h" - -namespace phi { -namespace dynload { -std::once_flag mufft_dso_flag; -void* mufft_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUFFT_FFT_ROUTINE_EACH(DEFINE_WRAP); - -bool HasMUFFT() { - std::call_once(mufft_dso_flag, - []() { mufft_dso_handle = GetMUFFTDsoHandle(); }); - return mufft_dso_handle != nullptr; -} - -void EnforceMUFFTLoaded(const char* fn_name) { - PADDLE_ENFORCE_NOT_NULL( - mufft_dso_handle, - phi::errors::PreconditionNotMet( - "Cannot load mufft shared library. 
Cannot invoke method %s.", - fn_name)); -} - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/mufft.h b/paddle/phi/backends/dynload/mufft.h deleted file mode 100644 index 70bfdd4c1efd18..00000000000000 --- a/paddle/phi/backends/dynload/mufft.h +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef PADDLE_WITH_MUSA -#include -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag mufft_dso_flag; -extern void* mufft_dso_handle; - -extern void EnforceMUFFTLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_MUFFT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using mufft_func = decltype(&::__name); \ - std::call_once(mufft_dso_flag, []() { \ - mufft_dso_handle = phi::dynload::GetMUFFTDsoHandle(); \ - }); \ - EnforceMUFFTLoaded(#__name); \ - static void* p_##__name = dlsym(mufft_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed mufft functions in HPPL - * different mufft version has different interfaces - **/ -#define MUFFT_FFT_ROUTINE_EACH(__macro) \ - __macro(mufftPlan1d); \ - __macro(mufftPlan2d); \ - __macro(mufftPlan3d); \ - __macro(mufftPlanMany); \ - __macro(mufftMakePlan1d); \ - __macro(mufftMakePlan2d); \ - __macro(mufftMakePlan3d); \ - __macro(mufftMakePlanMany); \ - __macro(mufftEstimate1d); \ - __macro(mufftEstimate2d); \ - __macro(mufftEstimate3d); \ - __macro(mufftEstimateMany); \ - __macro(mufftCreate); \ - __macro(mufftGetSize1d); \ - __macro(mufftGetSize2d); \ - __macro(mufftGetSize3d); \ - __macro(mufftGetSizeMany); \ - __macro(mufftGetSize); \ - __macro(mufftSetWorkArea); \ - __macro(mufftSetAutoAllocation); \ - __macro(mufftExecC2C); \ - __macro(mufftExecR2C); \ - __macro(mufftExecC2R); \ - __macro(mufftExecZ2Z); \ - __macro(mufftExecD2Z); \ - __macro(mufftExecZ2D); \ - __macro(mufftSetStream); \ - __macro(mufftDestroy); \ - __macro(mufftGetVersion); \ - __macro(mufftGetProperty); \ - __macro(mufftXtSetGPUs); \ - __macro(mufftXtMalloc); \ - __macro(mufftXtMemcpy); \ - __macro(mufftXtFree); \ - __macro(mufftXtExecDescriptorC2C); \ - __macro(mufftXtExecDescriptorR2C); \ - __macro(mufftXtExecDescriptorC2R); \ - __macro(mufftXtExecDescriptorZ2Z); \ - __macro(mufftXtExecDescriptorD2Z); \ - __macro(mufftXtExecDescriptorZ2D); \ - __macro(mufftXtQueryPlan); \ - __macro(mufftXtSetCallback); \ - __macro(mufftXtClearCallback); \ - __macro(mufftXtMakePlanMany); \ - __macro(mufftXtGetSizeMany); \ - __macro(mufftXtExec); \ - __macro(mufftXtExecDescriptor); -MUFFT_FFT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUFFT_WRAP) - - -inline const char *mufftGetErrorString(mufftResult_t status) { - switch (status) { - case 
MUFFT_SUCCESS: - return "'MUFFT_SUCCESS'. The mufft operation was successful."; - case MUFFT_INVALID_PLAN: - return "'MUFFT_INVALID_PLAN'. mufft was passed an invalid plan handle."; - case MUFFT_ALLOC_FAILED: - return "'MUFFT_ALLOC_FAILED'. mufft failed to allocate GPU or CPU " - "memory."; - case MUFFT_INVALID_TYPE: - return "'MUFFT_INVALID_TYPE'. No longer used."; - case MUFFT_INVALID_VALUE: - return "'MUFFT_INVALID_VALUE'. User specified an invalid pointer or " - "parameter."; - case MUFFT_INTERNAL_ERROR: - return "'MUFFT_INTERNAL_ERROR'. Driver or internal mufft library " - "error."; - case MUFFT_EXEC_FAILED: - return "'MUFFT_EXEC_FAILED'. Failed to execute an FFT on the GPU."; - case MUFFT_SETUP_FAILED: - return "'MUFFT_SETUP_FAILED'. The mufft library failed to initialize."; - case MUFFT_INVALID_SIZE: - return "'MUFFT_INVALID_SIZE'. User specified an invalid transform size."; - case MUFFT_UNALIGNED_DATA: - return "'MUFFT_UNALIGNED_DATA'. No longer used."; - case MUFFT_INCOMPLETE_PARAMETER_LIST: - return "'MUFFT_INCOMPLETE_PARAMETER_LIST'. Missing parameters in call."; - case MUFFT_INVALID_DEVICE: - return "'MUFFT_INVALID_DEVICE'. Execution of a plan was on different " - "GPU than plan creation."; - case MUFFT_PARSE_ERROR: - return "'MUFFT_PARSE_ERROR'. Internal plan database error."; - case MUFFT_NO_WORKSPACE: - return "'MUFFT_NO_WORKSPACE'. No workspace has been provided prior to " - "plan execution."; - case MUFFT_NOT_IMPLEMENTED: - return "'MUFFT_NOT_IMPLEMENTED'. Function does not implement " - "functionality for parameters given."; - case MUFFT_LICENSE_ERROR: - return "'MUFFT_LICENSE_ERROR'. Operation is not supported for " - "parameters given."; - case MUFFT_NOT_SUPPORTED: - return "'MUFFT_NOT_SUPPORTED'. Operation is not supported for " - "parameters given."; - default: - return "mufft_STATUS_UNKNOWN_ERROR"; - } -} - -} // namespace dynload -} // namespace phi - -#endif diff --git a/paddle/phi/backends/dynload/murand.cc b/paddle/phi/backends/dynload/murand.cc deleted file mode 100644 index bbeeb7bcd58981..00000000000000 --- a/paddle/phi/backends/dynload/murand.cc +++ /dev/null @@ -1,28 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/murand.h" - -namespace phi { -namespace dynload { - -std::once_flag murand_dso_flag; -void *murand_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/murand.h b/paddle/phi/backends/dynload/murand.h deleted file mode 100644 index 28380cd9423f04..00000000000000 --- a/paddle/phi/backends/dynload/murand.h +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { -extern std::once_flag murand_dso_flag; -extern void *murand_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - murandStatus_t operator()(Args... args) { \ - using murandFunc = decltype(&::__name); \ - std::call_once(murand_dso_flag, []() { \ - murand_dso_handle = phi::dynload::GetCurandDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(murand_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define MURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(murandCreateGenerator); \ - __macro(murandSetStream); \ - __macro(murandSetPseudoRandomGeneratorSeed); \ - __macro(murandGenerateUniform); \ - __macro(murandGenerateUniformDouble); \ - __macro(murandGenerateNormal); \ - __macro(murandDestroyGenerator); - -MURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.cc b/paddle/phi/backends/dynload/musa_driver.cc deleted file mode 100644 index 2173a8d6cdd819..00000000000000 --- a/paddle/phi/backends/dynload/musa_driver.cc +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/musa_driver.h" - -namespace phi { -namespace dynload { - -std::once_flag musa_dso_flag; -void* musa_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSA_ROUTINE_EACH(DEFINE_WRAP); - -bool HasCUDADriver() { - std::call_once(musa_dso_flag, []() { musa_dso_handle = GetCUDADsoHandle(); }); - return musa_dso_handle != nullptr; -} - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musa_driver.h b/paddle/phi/backends/dynload/musa_driver.h deleted file mode 100644 index 3534ab8213c936..00000000000000 --- a/paddle/phi/backends/dynload/musa_driver.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag musa_dso_flag; -extern void* musa_dso_handle; -extern bool HasCUDADriver(); - -#define DECLARE_DYNAMIC_LOAD_MUSA_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using musa_func = decltype(&::__name); \ - std::call_once(musa_dso_flag, []() { \ - musa_dso_handle = phi::dynload::GetCUDADsoHandle(); \ - }); \ - static void* p_##__name = dlsym(musa_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed musa driver functions - **/ -#define MUSA_ROUTINE_EACH(__macro) \ - __macro(muInit); \ - __macro(muDriverGetVersion); \ - __macro(muGetErrorString); \ - __macro(muModuleLoadData); \ - __macro(muModuleGetFunction); \ - __macro(muModuleUnload); \ - __macro(muOccupancyMaxActiveBlocksPerMultiprocessor); \ - __macro(muLaunchKernel); \ - __macro(muCtxCreate); \ - __macro(muCtxGetCurrent); \ - __macro(muDeviceGetCount); \ - __macro(muDevicePrimaryCtxGetState); \ - __macro(muDeviceGetAttribute); \ - __macro(muDeviceGet); - -MUSA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSA_WRAP); - -#undef DECLARE_DYNAMIC_LOAD_MUSA_WRAP - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.cc b/paddle/phi/backends/dynload/musartc.cc deleted file mode 100644 index 9cd25270a10167..00000000000000 --- a/paddle/phi/backends/dynload/musartc.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/musartc.h" - -namespace phi { -namespace dynload { - -std::once_flag musartc_dso_flag; -void* musartc_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSARTC_ROUTINE_EACH(DEFINE_WRAP); - -bool HasNVRTC() { - std::call_once(musartc_dso_flag, - []() { musartc_dso_handle = GetNVRTCDsoHandle(); }); - return musartc_dso_handle != nullptr; -} - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musartc.h b/paddle/phi/backends/dynload/musartc.h deleted file mode 100644 index ee85bebc503ec0..00000000000000 --- a/paddle/phi/backends/dynload/musartc.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -// #include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" -#include "paddle/phi/core/enforce.h" - -// TODO(MTAI): The following musa runtime compiling functions are not supported -// now. Here empty implementations are given temporarily. When compiler MCC -// supports these functions, we will replace them. -typedef struct _mtrtcProgram *mtrtcProgram; - -typedef enum { - MTRTC_SUCCESS = 0, - MTRTC_ERROR_OUT_OF_MEMORY = 1, - MTRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, - MTRTC_ERROR_INVALID_INPUT = 3, - MTRTC_ERROR_INVALID_PROGRAM = 4, - MTRTC_ERROR_INVALID_OPTION = 5, - MTRTC_ERROR_COMPILATION = 6, - MTRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, - MTRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, - MTRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, - MTRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, - MTRTC_ERROR_INTERNAL_ERROR = 11 -} mtrtcResult; - -inline mtrtcResult mtrtcVersion(int *major, int *minor) { - PADDLE_THROW( - phi::errors::Unimplemented("mtrtcVersion is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline const char *mtrtcGetErrorString(mtrtcResult result) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcGetErrorString is not supported on MUSA now!")); - return "mtrtcGetErrorString is not supported on MUSA now!"; -} - -inline mtrtcResult mtrtcCompileProgram(mtrtcProgram prog, - int numOptions, - const char *const *options) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcCompileProgram is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcCreateProgram(mtrtcProgram *prog, - const char *src, - const char *name, - int numHeaders, - const char *const *headers, - const char *const *includeNames) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcCreateProgram is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcDestroyProgram(mtrtcProgram *prog) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcDestroyProgram is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetMUSA(mtrtcProgram prog, char *musa) { - PADDLE_THROW( - phi::errors::Unimplemented("mtrtcGetMUSA is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetMUSASize(mtrtcProgram prog, size_t *musaSizeRet) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcGetMUSASize is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetProgramLog(mtrtcProgram prog, char *log) { - PADDLE_THROW(phi::errors::Unimplemented( - "mtrtcGetProgramLog is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -inline mtrtcResult mtrtcGetProgramLogSize(mtrtcProgram prog, - size_t *logSizeRet) { - PADDLE_THROW(phi::errors::Unimplemented( - 
"mtrtcGetProgramLogSize is not supported on MUSA now!")); - return mtrtcResult::MTRTC_ERROR_INTERNAL_ERROR; -} - -namespace phi { -namespace dynload { - -extern std::once_flag musartc_dso_flag; -extern void *musartc_dso_handle; -extern bool HasNVRTC(); - -#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using musartc_func = decltype(&::__name); \ - std::call_once(musartc_dso_flag, []() { \ - musartc_dso_handle = phi::dynload::GetNVRTCDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(musartc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed musartc functions - **/ -#define MUSARTC_ROUTINE_EACH(__macro) \ - __macro(mtrtcVersion); \ - __macro(mtrtcGetErrorString); \ - __macro(mtrtcCompileProgram); \ - __macro(mtrtcCreateProgram); \ - __macro(mtrtcDestroyProgram); \ - __macro(mtrtcGetMUSA); \ - __macro(mtrtcGetMUSASize); \ - __macro(mtrtcGetProgramLog); \ - __macro(mtrtcGetProgramLogSize) - -MUSARTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); - -#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP - -} // namespace dynload -} // namespace phi diff --git a/paddle/phi/backends/dynload/musparse.cc b/paddle/phi/backends/dynload/musparse.cc deleted file mode 100644 index 40d766f963c40c..00000000000000 --- a/paddle/phi/backends/dynload/musparse.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/backends/dynload/musparse.h" - -namespace phi { -namespace dynload { - -std::once_flag musparse_dso_flag; -void *musparse_dso_handle; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -MUSPARSE_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace phi - diff --git a/paddle/phi/backends/dynload/musparse.h b/paddle/phi/backends/dynload/musparse.h deleted file mode 100644 index e63182943190d5..00000000000000 --- a/paddle/phi/backends/dynload/musparse.h +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/backends/dynload/port.h" - -namespace phi { -namespace dynload { -extern std::once_flag musparse_dso_flag; -extern void *musparse_dso_handle; - -#define DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - musparseStatus_t operator()(Args... args) { \ - using Func = decltype(&::__name); \ - std::call_once(musparse_dso_flag, []() { \ - musparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \ - }); \ - static void *p_##__name = dlsym(musparse_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#if defined(PADDLE_WITH_MUSA) -#define MUSPARSE_ROUTINE_EACH(__macro) \ - __macro(musparseCreateHandle); \ - __macro(musparseDestroyHandle); \ - __macro(musparseSetStream); \ - __macro(musparseCreateMatDescr); \ - __macro(musparseSnnz); \ - __macro(musparseDnnz); \ - __macro(musparseSetMatType); \ - __macro(musparseSetMatIndexBase); \ - __macro(musparseCreateCsr); \ - __macro(musparseCreateCoo); \ - __macro(musparseCreateDnMat); \ - __macro(musparseCreateDnVec); \ - __macro(musparseSpMM); \ - __macro(musparseDestroySpMat); \ - __macro(musparseDestroyDnMat); \ - __macro(musparseDestroyDnVec); \ - __macro(musparseSpMV); \ - __macro(musparseSDDMM_bufferSize); \ - __macro(musparseSDDMM_preprocess); \ - __macro(musparseSDDMM); \ - __macro(musparseDnMatSetStridedBatch); \ - __macro(musparseCooSetStridedBatch); \ - __macro(musparseCsrSetStridedBatch); - -MUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP) - -#endif // PADDLE_WITH_MUSA - -#undef DECLARE_DYNAMIC_LOAD_MUSPARSE_WRAP -} // namespace dynload -} // namespace phi - diff --git a/paddle/phi/backends/dynload/nccl.h b/paddle/phi/backends/dynload/nccl.h index a5759b67e8df78..91b6f5dcd58dc5 100644 --- a/paddle/phi/backends/dynload/nccl.h +++ b/paddle/phi/backends/dynload/nccl.h @@ -42,18 +42,18 @@ extern void* nccl_dso_handle; #define NCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ __macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -67,7 +67,7 @@ NCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#define NCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); NCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_NCCL_WRAP) #endif diff --git a/paddle/phi/backends/dynload/rccl.h b/paddle/phi/backends/dynload/rccl.h index 651cc9c68b2438..e1018a3f253fa5 100644 --- a/paddle/phi/backends/dynload/rccl.h +++ b/paddle/phi/backends/dynload/rccl.h @@ -42,18 +42,18 @@ extern void* rccl_dso_handle; #define RCCL_RAND_ROUTINE_EACH(__macro) \ __macro(ncclCommInitAll); \ - __macro(mcclGetUniqueId); \ + __macro(ncclGetUniqueId); \ __macro(ncclCommInitRank); \ __macro(ncclCommAbort); \ __macro(ncclCommDestroy); \ 
__macro(ncclCommCount); \ __macro(ncclCommCuDevice); \ __macro(ncclCommUserRank); \ - __macro(mcclAllReduce); \ - __macro(mcclBcast); \ - __macro(mcclAllGather); \ - __macro(mcclGroupStart); \ - __macro(mcclGroupEnd); \ + __macro(ncclAllReduce); \ + __macro(ncclBcast); \ + __macro(ncclAllGather); \ + __macro(ncclGroupStart); \ + __macro(ncclGroupEnd); \ __macro(ncclReduce); \ __macro(ncclReduceScatter); \ __macro(ncclCommGetAsyncError); \ @@ -67,7 +67,7 @@ RCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif #if NCCL_VERSION_CODE >= 2304 -#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(mcclGetVersion); +#define RCCL_RAND_ROUTINE_EACH_AFTER_2304(__macro) __macro(ncclGetVersion); RCCL_RAND_ROUTINE_EACH_AFTER_2304(DECLARE_DYNAMIC_LOAD_RCCL_WRAP) #endif diff --git a/paddle/phi/backends/gpu/forwards.h b/paddle/phi/backends/gpu/forwards.h index 2b733c01bc01b5..e1f3492f768702 100644 --- a/paddle/phi/backends/gpu/forwards.h +++ b/paddle/phi/backends/gpu/forwards.h @@ -72,25 +72,6 @@ using cufftHandle = int; // Forward declaration of NCCL types. using ncclComm_t = struct ncclComm *; - - - -// Forward declaration of MUSA runtime types. -using musaStream_t = struct MUstream_st *; -using musaEvent_t = struct MUevent_st *; -using mublasHandle_t = struct _mublasHandle_t *; -namespace musa { -namespace dnn { -struct Handle; -} // namespace dnn -} // namespace musa -using mudnnHandle_t = musa::dnn::Handle *; -using musparseHandle_t = struct _musparse_handle *; -using mublasLtHandle_t = struct mublasLtContext *; -using mcclComm_t = struct mcclComm *; - - - /// Forward declaration of ROCM types. #include diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f250fb365ce85b..8d46c3e34cabdf 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -51,16 +51,6 @@ limitations under the License. */ #endif // !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) #endif // PADDLE_WITH_HIP - -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/mublas.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" -#endif // !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#endif // PADDLE_WITH_MUSA - - // NOTE: The paddle framework should add WITH_EIGEN option to support compile // without eigen. 
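
For orientation: the DECLARE_DYNAMIC_LOAD_*_WRAP macros touched in the dynload headers above (musartc, musparse, nccl, rccl) all expand to the same lazy-binding pattern — open the shared library once under a std::once_flag, resolve the symbol once with dlsym, then forward the call through a casted function pointer. A minimal standalone sketch of that pattern, using a hypothetical library and symbol (libfoo.so / foo_add) rather than any real Paddle loader:

    #include <dlfcn.h>
    #include <mutex>

    static std::once_flag foo_dso_flag;     // guards the one-time dlopen
    static void* foo_dso_handle = nullptr;  // handle to the shared library

    // Calls ::foo_add(int, int) out of libfoo.so, binding it lazily on first use.
    int CallFooAdd(int a, int b) {
      using foo_add_t = int (*)(int, int);
      std::call_once(foo_dso_flag, [] {
        foo_dso_handle = dlopen("libfoo.so", RTLD_NOW | RTLD_GLOBAL);  // load once
      });
      static void* p_foo_add = dlsym(foo_dso_handle, "foo_add");       // resolve once
      return reinterpret_cast<foo_add_t>(p_foo_add)(a, b);
    }

Each macro invocation in the headers above generates one such functor per API symbol, so switching a routine name from mcclAllReduce back to ncclAllReduce only changes which symbol dlsym resolves at run time.
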
#include "unsupported/Eigen/CXX11/Tensor" @@ -129,9 +119,6 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream())); @@ -156,11 +143,6 @@ static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void* user_data) #endif -#ifdef PADDLE_WITH_MUSA -static void StreamCallbackFunc(gpuStream_t stream, - gpuError_t status, - void* user_data) -#endif #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 static void CUDART_CB StreamCallbackFunc(void* user_data) @@ -188,8 +170,6 @@ void DnnWorkspaceHandle::RunFuncSync( std::lock_guard guard(*mtx_); #ifdef PADDLE_WITH_HIP auto status = hipMalloc(&workspace_ptr, size); -#elif defined(PADDLE_WITH_MUSA) - auto status = musaMalloc(&workspace_ptr, size); #else auto status = cudaMalloc(&workspace_ptr, size); #endif @@ -198,8 +178,6 @@ void DnnWorkspaceHandle::RunFuncSync( phi::backends::gpu::GpuStreamSync(stream_); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaFree(workspace_ptr)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); #endif @@ -270,9 +248,9 @@ struct GPUContext::Impl { DestoryInternalWorkspace(); DestoryInternalEigenDevice(); phi::DestroySparseHandle(sparse_handle_); - // phi::DestroySolverHandle(solver_handle_); + phi::DestroySolverHandle(solver_handle_); phi::DestroyDnnHandle(dnn_handle_); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (nccl_comm_) { // NOTE(liyurui): It is not recommend calling CUDA runtime API // in destructor. 
Since we can not ensure the release order of @@ -286,7 +264,7 @@ struct GPUContext::Impl { phi::DestroyBlasHandle(blas_handle_); phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); - // phi::DestroyBlasLtHandle(blaslt_handle_); + phi::DestroyBlasLtHandle(blaslt_handle_); } if (stream_owned_ && stream_) { delete stream_; @@ -447,24 +425,24 @@ struct GPUContext::Impl { blas_tf32_tensor_core_handle_creator_ = std::move(handle_creator); } - // void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } + void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } - // void SetBlasLtHandle(std::function&& handle_creator) { - // blaslt_handle_creator_ = std::move(handle_creator); - // } + void SetBlasLtHandle(std::function&& handle_creator) { + blaslt_handle_creator_ = std::move(handle_creator); + } - // blasLtHandle_t GetBlasLtHandle() { - // std::call_once(flag_blaslt_, [&]() { - // if (!blaslt_handle_) { - // if (!blaslt_handle_creator_) - // phi::InitBlasLtHandle(&blaslt_handle_); - // else - // blaslt_handle_ = blaslt_handle_creator_(); - // } - // }); - // PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); - // return blaslt_handle_; - // } + blasLtHandle_t GetBlasLtHandle() { + std::call_once(flag_blaslt_, [&]() { + if (!blaslt_handle_) { + if (!blaslt_handle_creator_) + phi::InitBlasLtHandle(&blaslt_handle_); + else + blaslt_handle_ = blaslt_handle_creator_(); + } + }); + PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); + return blaslt_handle_; + } dnnHandle_t GetDnnHandle() { std::call_once(flag_dnn_, [&]() { @@ -486,11 +464,6 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(dnn_handle_)); dnn_handle_ = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (owned_ && dnn_handle_ != nullptr) { - phi::dynload::mudnnDestroy(dnn_handle_); - dnn_handle_ = nullptr; - } #else if (owned_ && dnn_handle_ != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(dnn_handle_)); @@ -505,25 +478,25 @@ struct GPUContext::Impl { dnn_handle_creator_ = std::move(handle_creator); } - // solverHandle_t GetSolverHandle() { - // std::call_once(flag_slover_, [&]() { - // if (!solver_handle_) { - // if (!solver_handle_creator_) { - // phi::InitSolverHandle(&solver_handle_, stream()); - // } else { - // solver_handle_ = solver_handle_creator_(); - // } - // } - // }); - // PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); - // return solver_handle_; - // } + solverHandle_t GetSolverHandle() { + std::call_once(flag_slover_, [&]() { + if (!solver_handle_) { + if (!solver_handle_creator_) { + phi::InitSolverHandle(&solver_handle_, stream()); + } else { + solver_handle_ = solver_handle_creator_(); + } + } + }); + PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); + return solver_handle_; + } - // void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } + void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } - // void SetSolverHandle(std::function&& handle_creator) { - // solver_handle_creator_ = std::move(handle_creator); - // } + void SetSolverHandle(std::function&& handle_creator) { + solver_handle_creator_ = std::move(handle_creator); + } sparseHandle_t GetSparseHandle() { std::call_once(flag_sparse_, [&]() { @@ -556,9 +529,6 @@ struct GPUContext::Impl { break; } #endif // !defined(_WIN32) -#elif defined(PADDLE_WITH_MUSA) - musaError_t e_sync = musaSuccess; - e_sync = 
musaStreamSynchronize(stream()); #else // PADDLE_WITH_HIP cudaError_t e_sync = cudaSuccess; #if !defined(_WIN32) @@ -577,23 +547,21 @@ struct GPUContext::Impl { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(stream(), ev, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(stream(), ev, 0)); #endif } - mcclComm_t GetNcclComm() const { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + ncclComm_t GetNcclComm() const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // PD_CHECK(nccl_comm_ != nullptr, "the gpu nccl_comm is nullptr."); return nccl_comm_; #endif return nullptr; } - void SetNcclComm(mcclComm_t comm) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) + void SetNcclComm(ncclComm_t comm) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) nccl_comm_ = comm; #endif } @@ -710,8 +678,6 @@ struct GPUContext::Impl { void RecordEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(ev, stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaEventRecord(ev, stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(ev, stream())); #endif @@ -734,12 +700,6 @@ struct GPUContext::Impl { PADDLE_ENFORCE_GPU_SUCCESS( hipStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); #endif - -#ifdef PADDLE_WITH_MUSA - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(stream(), internal::StreamCallbackFunc, func, 0)); -#endif - #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 10000 PADDLE_ENFORCE_GPU_SUCCESS( @@ -752,7 +712,7 @@ struct GPUContext::Impl { } void WaitStreamCallback() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUDA) phi::backends::gpu::GpuStreamSync(stream()); #endif { @@ -804,12 +764,12 @@ struct GPUContext::Impl { std::function blas_tensor_core_handle_creator_{nullptr}; blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; std::function blas_tf32_tensor_core_handle_creator_{nullptr}; - // blasLtHandle_t blaslt_handle_{nullptr}; - // std::function blaslt_handle_creator_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; + std::function blaslt_handle_creator_{nullptr}; dnnHandle_t dnn_handle_{nullptr}; std::function dnn_handle_creator_{nullptr}; - // solverHandle_t solver_handle_{nullptr}; - // std::function solver_handle_creator_{nullptr}; + solverHandle_t solver_handle_{nullptr}; + std::function solver_handle_creator_{nullptr}; sparseHandle_t sparse_handle_{nullptr}; std::function sparse_handle_creator_{nullptr}; DnnWorkspaceHandle* workspace_{nullptr}; @@ -823,7 +783,7 @@ struct GPUContext::Impl { std::once_flag flag_tensorcore_cublas_; std::once_flag flag_eigen_device_; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs // both within and across nodes. @@ -832,7 +792,7 @@ struct GPUContext::Impl { // NOTE: Distributed communicator, distributed framework manages its // resources. 
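
The GetBlasLtHandle and GetSolverHandle accessors restored a little earlier follow a common lazy-initialization shape: a std::once_flag guards creation, and an optional caller-supplied creator can replace the default initializer. A reduced sketch of that shape, using a made-up FooHandle type rather than the real blasLt or solver handle types:

    #include <functional>
    #include <mutex>

    struct FooHandle {};  // stand-in for a vendor handle type

    class LazyFooHandle {
     public:
      // Optional: inject a custom creator before first use.
      void SetCreator(std::function<FooHandle*()> creator) { creator_ = std::move(creator); }

      FooHandle* Get() {
        std::call_once(flag_, [&] {
          handle_ = creator_ ? creator_() : new FooHandle();  // default-create if no creator was injected
        });
        return handle_;
      }

     private:
      std::once_flag flag_;
      std::function<FooHandle*()> creator_;
      FooHandle* handle_ = nullptr;
    };

The creator hook exists for the same reason as in the code above: an outer resource manager can hand the context a pre-built handle instead of letting the context create, own, and later destroy one of its own.
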
- mcclComm_t nccl_comm_{nullptr}; + ncclComm_t nccl_comm_{nullptr}; #endif mutable std::mutex blas_mtx_; @@ -879,13 +839,13 @@ blasHandle_t GPUContext::cublas_handle() const { return impl_->GetBlasHandle(); } -// blasLtHandle_t GPUContext::cublaslt_handle() const { -// return impl_->GetBlasLtHandle(); -// } +blasLtHandle_t GPUContext::cublaslt_handle() const { + return impl_->GetBlasLtHandle(); +} -// solverHandle_t GPUContext::cusolver_dn_handle() const { -// return impl_->GetSolverHandle(); -// } +solverHandle_t GPUContext::cusolver_dn_handle() const { + return impl_->GetSolverHandle(); +} sparseHandle_t GPUContext::cusparse_handle() const { return impl_->GetSparseHandle(); @@ -954,9 +914,9 @@ void GPUContext::AddStreamCallback( void GPUContext::WaitStreamCallback() const { impl_->WaitStreamCallback(); } -mcclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } +ncclComm_t GPUContext::nccl_comm() const { return impl_->GetNcclComm(); } -void GPUContext::set_nccl_comm(mcclComm_t comm) { impl_->SetNcclComm(comm); } +void GPUContext::set_nccl_comm(ncclComm_t comm) { impl_->SetNcclComm(comm); } void GPUContext::Init() { impl_->allocator_ = const_cast(&this->GetAllocator()); // NOLINT @@ -1005,13 +965,13 @@ void GPUContext::SetBlasTF32Handle(std::function&& func) { impl_->SetBlasTF32Handle(std::move(func)); } -// void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { -// impl_->SetBlasLtHandle(blaslt); -// } +void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { + impl_->SetBlasLtHandle(blaslt); +} -// void GPUContext::SetBlasLtHandle(std::function&& func) { -// impl_->SetBlasLtHandle(std::move(func)); -// } +void GPUContext::SetBlasLtHandle(std::function&& func) { + impl_->SetBlasLtHandle(std::move(func)); +} void GPUContext::SetDnnHandle(dnnHandle_t handle) { impl_->SetDnnHandle(handle); @@ -1021,13 +981,13 @@ void GPUContext::SetDnnHandle(std::function&& func) { impl_->SetDnnHandle(std::move(func)); } -// void GPUContext::SetSolverHandle(solverHandle_t handle) { -// impl_->SetSolverHandle(handle); -// } +void GPUContext::SetSolverHandle(solverHandle_t handle) { + impl_->SetSolverHandle(handle); +} -// void GPUContext::SetSolverHandle(std::function&& func) { -// impl_->SetSolverHandle(std::move(func)); -// } +void GPUContext::SetSolverHandle(std::function&& func) { + impl_->SetSolverHandle(std::move(func)); +} void GPUContext::SetSparseHandle(sparseHandle_t handle) { impl_->SetSparseHandle(handle); @@ -1086,7 +1046,7 @@ void GPUContext::SetDnnAttr(const std::string& attr_name, Attribute attr) { void GPUContext::ClearDnnAttr() { return impl_->ClearDnnAttr(); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) GPUPinnedContext::GPUPinnedContext() { eigen_device_ = std::make_unique(); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 19eb5dd05cd3c1..8cd0d414bc105b 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU_KP) #include @@ -109,10 +109,10 @@ class PADDLE_API GPUContext : public DeviceContext, blasHandle_t cublas_handle() const; /*! \brief Return cublasLt handle in the device context. 
*/ - // blasLtHandle_t cublaslt_handle() const; + blasLtHandle_t cublaslt_handle() const; /*! \brief Return cusolver handle in the device context. */ - // solverHandle_t cusolver_dn_handle() const; + solverHandle_t cusolver_dn_handle() const; /*! \brief Return cusparse handle in the device context. */ sparseHandle_t cusparse_handle() const; @@ -183,10 +183,10 @@ class PADDLE_API GPUContext : public DeviceContext, public: /*! \brief Return nccl communicators. */ - mcclComm_t nccl_comm() const; + ncclComm_t nccl_comm() const; /*! \brief Set nccl communicators. */ - void set_nccl_comm(mcclComm_t comm); + void set_nccl_comm(ncclComm_t comm); public: // NOTE: DeviceContext hold resources. Used in training scenarios. @@ -232,14 +232,14 @@ class PADDLE_API GPUContext : public DeviceContext, void SetBlasTF32Handle(blasHandle_t); void SetBlasTF32Handle(std::function&&); - // void SetBlasLtHandle(blasLtHandle_t); - // void SetBlasLtHandle(std::function&&); + void SetBlasLtHandle(blasLtHandle_t); + void SetBlasLtHandle(std::function&&); void SetDnnHandle(dnnHandle_t); void SetDnnHandle(std::function&&); - // void SetSolverHandle(solverHandle_t); - // void SetSolverHandle(std::function&&); + void SetSolverHandle(solverHandle_t); + void SetSolverHandle(std::function&&); void SetSparseHandle(sparseHandle_t); void SetSparseHandle(std::function&&); @@ -276,7 +276,7 @@ using GPUDNNContext = GPUContext; // because we want to implement a KPS-based kernel and make it run // on GPU and XPU at the same time, so we need KPSContext when registering // KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) using KPSContext = GPUContext; #endif @@ -287,7 +287,7 @@ struct DefaultDevice; } // namespace Eigen namespace phi { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // Currently, GPUPinnedContext is only used to data copying. 
class GPUPinnedContext : public DeviceContext, diff --git a/paddle/phi/backends/gpu/gpu_decls.h b/paddle/phi/backends/gpu/gpu_decls.h index e791326d71fd49..4a6b9d2fd87f13 100644 --- a/paddle/phi/backends/gpu/gpu_decls.h +++ b/paddle/phi/backends/gpu/gpu_decls.h @@ -16,66 +16,57 @@ #pragma once #include "paddle/phi/backends/gpu/forwards.h" -// #include "mudnn/export/c/mudnn_compatible.h" + namespace phi { #ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = MUSA_TYPE; + #else // PADDLE_WITH_CDUA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ + +#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; #endif -DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t,musaStream_t); -DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t,musaEvent_t); +DECLARE_TYPE_FOR_GPU(gpuStream_t, cudaStream_t, hipStream_t); +DECLARE_TYPE_FOR_GPU(gpuEvent_t, cudaEvent_t, hipEvent_t); -// DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, -// cudnnActivationStruct, -// miopenActivationDescriptor, -// mudnnActivationStruct); -// DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, -// cudnnTensorStruct, -// miopenTensorDescriptor, -// mudnnTensorStruct); -// DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, -// cudnnFilterStruct, -// miopenTensorDescriptor, -// mudnnFilterStruct); -// DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, -// cudnnFilterDescriptor_t, -// miopenTensorDescriptor_t, -// mudnnFilterDescriptor_t); -// DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, -// cudnnConvolutionStruct, -// miopenConvolutionDescriptor, -// mudnnConvolutionStruct); -// DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, -// cudnnConvolutionDescriptor_t, -// miopenConvolutionDescriptor_t, -// mudnnConvolutionDescriptor_t); -// DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, -// cudnnPoolingDescriptor_t, -// miopenPoolingDescriptor_t, -// mudnnPoolingDescriptor_t); -// DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, -// cudnnDropoutDescriptor_t, -// miopenDropoutDescriptor_t, -// mudnnDropoutDescriptor_t); -DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t,mudnnHandle_t); +DECLARE_TYPE_FOR_GPU(dnnActivationDescriptor, + cudnnActivationStruct, + miopenActivationDescriptor); +DECLARE_TYPE_FOR_GPU(dnnTensorDescriptor, + cudnnTensorStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor, + cudnnFilterStruct, + miopenTensorDescriptor); +DECLARE_TYPE_FOR_GPU(dnnFilterDescriptor_t, + cudnnFilterDescriptor_t, + miopenTensorDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor, + cudnnConvolutionStruct, + miopenConvolutionDescriptor); +DECLARE_TYPE_FOR_GPU(dnnConvolutionDescriptor_t, + cudnnConvolutionDescriptor_t, + miopenConvolutionDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnPoolingDescriptor_t, + cudnnPoolingDescriptor_t, + miopenPoolingDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, + miopenDropoutDescriptor_t); +DECLARE_TYPE_FOR_GPU(dnnHandle_t, cudnnHandle_t, miopenHandle_t); -DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle,mublasHandle_t); +DECLARE_TYPE_FOR_GPU(blasHandle_t, cublasHandle_t, rocblas_handle); // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workround. 
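
As a reminder of what the restored two-argument DECLARE_TYPE_FOR_GPU form does, each invocation collapses to a single using-alias picked by the build flags. A reduced sketch with a toy alias name (toyGpuStream_t); the vendor struct names here are only stand-ins, not the real forward declarations:

    #ifdef PADDLE_WITH_HIP
    #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
      using GPU_TYPE = ROCM_TYPE;
    #else  // CUDA build
    #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \
      using GPU_TYPE = CUDA_TYPE;
    #endif

    // Toy stand-ins for the vendor stream types the real header forward-declares.
    struct ToyCudaStream;
    struct ToyHipStream;

    // Expands to `using toyGpuStream_t = ToyCudaStream*;` on CUDA builds
    // and to `using toyGpuStream_t = ToyHipStream*;` on HIP builds.
    DECLARE_TYPE_FOR_GPU(toyGpuStream_t, ToyCudaStream*, ToyHipStream*);

    #undef DECLARE_TYPE_FOR_GPU
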
-// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle, mublasHandle_t); +DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); -// DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle, musolverDnHandle_t); +DECLARE_TYPE_FOR_GPU(solverHandle_t, cusolverDnHandle_t, rocsolver_handle); -DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle, musparseHandle_t); +DECLARE_TYPE_FOR_GPU(sparseHandle_t, cusparseHandle_t, rocsparse_handle); #undef DECLARE_TYPE_FOR_GPU diff --git a/paddle/phi/backends/gpu/gpu_device_function.h b/paddle/phi/backends/gpu/gpu_device_function.h index 5c0c475b140ff0..0f79e2a645ab34 100644 --- a/paddle/phi/backends/gpu/gpu_device_function.h +++ b/paddle/phi/backends/gpu/gpu_device_function.h @@ -13,12 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_device_function.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/musa_device_function.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_device_function.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_dnn.h b/paddle/phi/backends/gpu/gpu_dnn.h index 30cf3fae80519b..f37afa3deeb746 100644 --- a/paddle/phi/backends/gpu/gpu_dnn.h +++ b/paddle/phi/backends/gpu/gpu_dnn.h @@ -14,14 +14,11 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/miopen_desc.h" #include "paddle/phi/backends/gpu/rocm/miopen_helper.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/mudnn_desc.h" -#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" #else // CUDA #include "paddle/phi/backends/gpu/cuda/cudnn_desc.h" #include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" diff --git a/paddle/phi/backends/gpu/gpu_helper.h b/paddle/phi/backends/gpu/gpu_helper.h index 8afa826408cb7a..2353b42794ffdd 100644 --- a/paddle/phi/backends/gpu/gpu_helper.h +++ b/paddle/phi/backends/gpu/gpu_helper.h @@ -13,12 +13,10 @@ // limitations under the License. #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/gpu/rocm/rocm_helper.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/gpu/musa/musa_helper.h" #else #include "paddle/phi/backends/gpu/cuda/cuda_helper.h" #endif diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h index 2d1b7c1a98f27f..ebf57bd06eb19d 100644 --- a/paddle/phi/backends/gpu/gpu_info.h +++ b/paddle/phi/backends/gpu/gpu_info.h @@ -11,7 +11,7 @@ limitations under the License. 
*/ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 4e300a3031a258..fd712baf754803 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -16,12 +16,10 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_CUDA #include -#elif defined(PADDLE_WITH_MUSA) -#include #else #include #endif diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index 98ebea87eedfd8..b9c49cb5696633 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -16,10 +16,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif @@ -147,7 +143,7 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_cast(val)); // NOLINT } -#if defined(__HIPCC__) || defined(__MUSACC__)|| (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { @@ -399,12 +395,188 @@ CUDA_ATOMIC_WRAPPER(Add, complex) { CudaAtomicAdd(imag, val.imag)); } +// For atomicMul. +CUDA_ATOMIC_WRAPPER(Mul, int) { + int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, unsigned int) { + unsigned int res = *address, old = res; // NOLINT + do { + old = res; + res = atomicCAS(address, // NOLINT + old, // NOLINT + val * old); // NOLINT + } while (old != res); + return res; +} +// CUDA API uses unsigned long long int, we cannot use uint64_t here. +// It because unsigned long long int is not necessarily uint64_t +CUDA_ATOMIC_WRAPPER(Mul, unsigned long long int) { // NOLINT + unsigned long long int old = *address, assumed; // NOLINT + + do { + assumed = old; + old = atomicCAS(address, assumed, val * assumed); + } while (assumed != old); + return old; +} + +CUDA_ATOMIC_WRAPPER(Mul, int64_t) { + // Here, we check long long int must be int64_t. 
+ static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT + "long long should be int64"); + long long int res = *address, old = res; // NOLINT + do { + old = res; + res = (long long int)atomicCAS( // NOLINT + (unsigned long long int *)address, // NOLINT + (unsigned long long int)old, // NOLINT + (unsigned long long int)val * (unsigned long long int)old); // NOLINT + } while (old != res); + return res; +} + +CUDA_ATOMIC_WRAPPER(Mul, float) { + int *const address_as_i = reinterpret_cast(address); + int old = *address_as_i, assumed; + + do { + assumed = old; + old = atomicCAS( + address_as_i, assumed, __float_as_int(val * __int_as_float(assumed))); + } while (assumed != old); + + return __int_as_float(old); +} + +CUDA_ATOMIC_WRAPPER(Mul, double) { + unsigned long long int *const address_as_ull = // NOLINT + reinterpret_cast(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT + + do { + assumed = old; + + old = atomicCAS(address_as_ull, + assumed, + __double_as_longlong(val * __longlong_as_double(assumed))); + } while (assumed != old); + + return __longlong_as_double(old); +} + +#ifdef PADDLE_CUDA_FP16 +inline static __device__ uint32_t mul_to_low_half(uint32_t val, float x) { + phi::dtype::float16 low_half; + // The float16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t mul_to_high_half(uint32_t val, float x) { + phi::dtype::float16 high_half; + // The float16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = + static_cast(static_cast(high_half) * x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::float16) { + if (*address >= val) { + return *address; + } + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The float16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The float16 value stay at higher 16 bits of the address. 
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::float16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + +inline static __device__ uint32_t bf16_mul_to_low_half(uint32_t val, float x) { + phi::dtype::bfloat16 low_half; + // The bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = + static_cast(static_cast(low_half) * x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_mul_to_high_half(uint32_t val, float x) { + phi::dtype::bfloat16 high_half; + // The bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = + static_cast(static_cast(high_half) * x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Mul, phi::dtype::bfloat16) { + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // The bfloat16 value stay at lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_low_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // The bfloat16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS( + address_as_ui, assumed, bf16_mul_to_high_half(assumed, val_f)); + } while (old != assumed); + phi::dtype::bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} + // For atomicMax USE_CUDA_ATOMIC(Max, int); USE_CUDA_ATOMIC(Max, unsigned int); // CUDA API uses unsigned long long int, we cannot use uint64_t here. // It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) || defined(__MUSACC__) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT #else CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT @@ -590,7 +762,7 @@ USE_CUDA_ATOMIC(Min, int); USE_CUDA_ATOMIC(Min, unsigned int); // CUDA API uses unsigned long long int, we cannot use uint64_t here. 
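
The CUDA_ATOMIC_WRAPPER(Mul, ...) additions above all rely on the same compare-and-swap loop, since CUDA exposes no native atomic multiply. A self-contained device-side sketch of that loop for float (a simplified variant with a hypothetical name, not the wrapper itself):

    #include <cuda_runtime.h>

    // Atomic multiply for float, built on atomicCAS over the raw 32-bit pattern.
    __device__ float ToyAtomicMulFloat(float* address, float val) {
      int* address_as_i = reinterpret_cast<int*>(address);
      int old = *address_as_i, assumed;
      do {
        assumed = old;
        // Try to install old * val; retry if another thread changed *address meanwhile.
        old = atomicCAS(address_as_i, assumed,
                        __float_as_int(val * __int_as_float(assumed)));
      } while (assumed != old);
      return __int_as_float(old);  // value observed before this thread's multiply
    }

The 16-bit float16/bfloat16 variants above use the same loop, but CAS a full 32-bit word and rewrite only the half that holds the value, which is why they branch on the address's 0x02 bit.
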
// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) || defined(__MUSACC__) +#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT #else CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 89471ba29aee00..a29b5e110922a4 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -37,10 +37,6 @@ #include "paddle/phi/backends/dynload/rocsparse.h" #endif -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/musparse.h" -#endif - #include "glog/logging.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -68,9 +64,10 @@ void InitGpuProperties(Place place, *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId()); *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId()); -#ifdef PADDLE_WITH_CUDA const gpuDeviceProp& prop = backends::gpu::GetDeviceProperties(place.GetDeviceId()); + +#ifdef PADDLE_WITH_CUDA static const std::set compiled_archs{CUDA_REAL_ARCHS}; // Make sure compiled cuda arch is as same as runtime cuda arch. if (compiled_archs.find(*compute_capability) == compiled_archs.cend() && @@ -118,17 +115,6 @@ void InitGpuProperties(Place place, } #endif -#ifdef PADDLE_WITH_MUSA - LOG_FIRST_N(INFO, 1) << "Please NOTE: device: " - << static_cast(place.device) - << ", GPU Compute Capability: " - << *compute_capability / 10 << "." - << *compute_capability % 10 - << ", Driver API Version: " << *driver_version / 10000 - << "." << (*driver_version % 10000) / 100 - << ", Runtime API Version: " << *runtime_version / 10000 - << "." << (*runtime_version % 10000) / 100; -#else // TODO(wilber): glog may be replaced in the future? LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << static_cast(place.device) @@ -140,7 +126,6 @@ void InitGpuProperties(Place place, << ", Runtime API Version: " << *runtime_version / 1000 << "." << (*runtime_version % 100) / 10; -#endif #ifdef PADDLE_WITH_HIP size_t miopen_major, miopen_minor, miopen_patch; PADDLE_ENFORCE_GPU_SUCCESS( @@ -159,62 +144,42 @@ void InitGpuProperties(Place place, << "Please recompile or reinstall Paddle with compatible MIOPEN " "version."; } -#elif defined(PADDLE_WITH_MUSA) - // TODO(@caizhi): mudnnGetVersion is not supported for MUSA now. - // Requests have been submitted to Mudnn. - // size_t mudnn_dso_ver = dynload::mudnnGetVersion(); - size_t mudnn_dso_ver = 2500; - LOG_FIRST_N(INFO, 1) << "device: " << static_cast(place.device) - << ", muDNN Version: " << mudnn_dso_ver / 1000 << "." - << (mudnn_dso_ver % 1000) / 100 << "."; - - // Check MUSA/MUDNN version compatiblity - auto local_musa_version = *driver_version; - int compile_musa_version = MUSA_VERSION; -#if defined(__linux__) - PADDLE_ENFORCE_EQ( - (local_musa_version / 100 < compile_musa_version / 100) && - (mudnn_dso_ver / 1000 < MUDNN_VERSION / 1000), - false, - phi::errors::InvalidArgument( - "The installed Paddle is compiled with MUSA%d/muDNN%d," - "but MUSA/muDNN version in your machine is MUSA%d/muDNN%d. " - "which will cause serious incompatible bug. 
" - "Please recompile or reinstall Paddle with compatible MUSA/muDNN " - "version.", - compile_musa_version / 10000, - MUDNN_VERSION / 1000, - local_musa_version / 10000, - mudnn_dso_ver / 1000)); -#endif - if (local_musa_version < compile_musa_version) { - LOG_FIRST_N(WARNING, 1) - << "WARNING: device: " << static_cast(place.device) - << ". The installed Paddle is compiled with MUSA " - << compile_musa_version / 10000 << "." - << (compile_musa_version % 1000) / 100 - << ", but MUSA runtime version in your machine is " - << local_musa_version / 10000 << "." - << (local_musa_version % 1000) / 100 - << ", which may cause serious incompatible bug. " - << "Please recompile or reinstall Paddle with compatible MUSA " - "version."; - } #else size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + auto get_cudnn_major = [](auto version) { + if (version < 9000) { + return version / 1000; + } + // CUDNN changes the CUDNN_VERSION rules after 9.0 + return version / 10000; + }; + auto get_cudnn_minor = [](auto version) { + if (version < 9000) { + return (version % 1000) / 100; + } + // CUDNN changes the CUDNN_VERSION rules after 9.0 + return (version % 10000) / 100; + }; + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) - << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." - << (cudnn_dso_ver % 1000) / 100 << "."; + << ", cuDNN Version: " + << get_cudnn_major(cudnn_dso_ver) << "." + << get_cudnn_minor(cudnn_dso_ver) << "."; // Check CUDA/CUDNN version compatiblity auto local_cuda_version = (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; auto compile_cuda_version = (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + + // Compute cuDNN major + auto local_cudnn_major = get_cudnn_major(cudnn_dso_ver); + size_t compile_cudnn_major = CUDNN_MAJOR; + #if defined(__linux__) PADDLE_ENFORCE_EQ( (local_cuda_version / 10 < compile_cuda_version / 10) && - (cudnn_dso_ver / 1000 < CUDNN_VERSION / 1000), + (local_cudnn_major < compile_cudnn_major), false, phi::errors::InvalidArgument( "The installed Paddle is compiled with CUDA%d/cuDNN%d," @@ -223,9 +188,9 @@ void InitGpuProperties(Place place, "Please recompile or reinstall Paddle with compatible CUDA/cuDNN " "version.", compile_cuda_version / 10, - CUDNN_VERSION / 1000, + compile_cudnn_major, local_cuda_version / 10, - cudnn_dso_ver / 1000)); + local_cudnn_major)); #endif if (local_cuda_version < compile_cuda_version) { LOG_FIRST_N(WARNING, 1) @@ -241,14 +206,10 @@ void InitGpuProperties(Place place, #endif } - void InitStream(gpuStream_t* stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamCreateWithPriority(stream, musaStreamDefault, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); @@ -259,8 +220,6 @@ void DestoryStream(gpuStream_t stream) { if (stream != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); #endif @@ -272,9 +231,6 @@ void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { #ifdef PADDLE_WITH_HIP phi::dynload::rocblas_create_handle(blas_handle); phi::dynload::rocblas_set_stream(*blas_handle, stream); -#elif defined(PADDLE_WITH_MUSA) - phi::dynload::mublasCreate(blas_handle); - phi::dynload::mublasSetStream(*blas_handle, 
stream); #else // PADDLE_WITH_CUDA PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); PADDLE_RETRY_CUDA_SUCCESS( @@ -288,11 +244,6 @@ void DestroyBlasHandle(blasHandle_t handle) { phi::dynload::rocblas_destroy_handle(handle); handle = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - phi::dynload::mublasDestroy(handle); - handle = nullptr; - } #else if (handle != nullptr) { phi::dynload::cublasDestroy(handle); @@ -301,20 +252,20 @@ void DestroyBlasHandle(blasHandle_t handle) { #endif // PADDLE_WITH_HIP } -// void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 -// phi::dynload::cublasLtCreate(blaslt_handle); -// #endif -// } +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtCreate(blaslt_handle); +#endif +} -// void DestroyBlasLtHandle(blasLtHandle_t handle) { -// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 -// if (handle != nullptr) { -// phi::dynload::cublasLtDestroy(handle); -// handle = nullptr; -// } -// #endif -// } +void DestroyBlasLtHandle(blasLtHandle_t handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + if (handle != nullptr) { + phi::dynload::cublasLtDestroy(handle); + handle = nullptr; + } +#endif +} void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { if (phi::dynload::HasCUDNN()) { @@ -338,9 +289,6 @@ void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { } PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); -#elif defined(PADDLE_WITH_MUSA) - phi::dynload::mudnnCreate(handle, place.device); - phi::dynload::mudnnSetStream(*handle, stream); #else auto version = phi::dynload::cudnnGetVersion(); auto local_cudnn_major = @@ -371,11 +319,6 @@ void DestroyDnnHandle(dnnHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); handle = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - phi::dynload::mudnnDestroy(handle); - handle = nullptr; - } #else if (handle != nullptr) { PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); @@ -384,21 +327,21 @@ void DestroyDnnHandle(dnnHandle_t handle) { #endif // PADDLE_WITH_HIP } -// void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { -// #if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) -// PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); -// PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); -// #endif -// } +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); +#endif +} -// void DestroySolverHandle(solverHandle_t solver_handle) { -// #if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) -// if (solver_handle != nullptr) { -// PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); -// solver_handle = nullptr; -// } -// #endif -// } +void DestroySolverHandle(solverHandle_t solver_handle) { +#ifndef PADDLE_WITH_HIP + if (solver_handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); + solver_handle = nullptr; + } +#endif +} void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { // ROCM is not yet supported @@ -411,9 
+354,6 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { #elif defined(PADDLE_WITH_HIP) phi::dynload::rocsparse_create_handle(handle); phi::dynload::rocsparse_set_stream(*handle, stream); -#elif defined(PADDLE_WITH_MUSA) - phi::dynload::musparseCreateHandle(handle); - phi::dynload::musparseSetStream(*handle, stream); #endif } @@ -430,11 +370,6 @@ void DestroySparseHandle(sparseHandle_t handle) { phi::dynload::rocsparse_destroy_handle(handle); handle = nullptr; } -#elif defined(PADDLE_WITH_MUSA) - if (handle != nullptr) { - phi::dynload::musparseDestroyHandle(handle); - handle = nullptr; - } #endif } diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h index df6a131ff315d7..7bec5eebf5886f 100644 --- a/paddle/phi/backends/gpu/gpu_resources.h +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -35,14 +35,14 @@ void DestoryStream(gpuStream_t stream); void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream); void DestroyBlasHandle(blasHandle_t handle); -// void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); -// void DestroyBlasLtHandle(blasLtHandle_t handle); +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle); +void DestroyBlasLtHandle(blasLtHandle_t handle); void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place); void DestroyDnnHandle(dnnHandle_t handle); -// void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); -// void DestroySolverHandle(solverHandle_t solver_handle); +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream); +void DestroySolverHandle(solverHandle_t solver_handle); void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream); void DestroySparseHandle(sparseHandle_t handle); diff --git a/paddle/phi/backends/gpu/gpu_types.h b/paddle/phi/backends/gpu/gpu_types.h index 00c0bdf6c545bc..77f403795b6b3d 100644 --- a/paddle/phi/backends/gpu/gpu_types.h +++ b/paddle/phi/backends/gpu/gpu_types.h @@ -17,15 +17,11 @@ #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/miopen.h" #include "paddle/phi/backends/dynload/rocblas.h" -#elif defined(PADDLE_WITH_MUSA) -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/dynload/mudnn.h" #else // PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cublas.h" #include "paddle/phi/backends/dynload/cudnn.h" @@ -33,40 +29,19 @@ namespace phi { -#ifdef PADDLE_WITH_HIP -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = ROCM_TYPE; - -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = MUSA_TYPE; - -#else // PADDLE_WITH_MUSA -#define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE, MUSA_TYPE) \ - using GPU_TYPE = CUDA_TYPE; -#endif // PADDLE_WITH_CUDA - -DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t, musaError_t); -DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, - cudaMemcpyKind, - hipMemcpyKind, - musaMemcpyKind); -DECLARE_TYPE_FOR_GPU(gpuDeviceProp, - cudaDeviceProp, - hipDeviceProp_t, - musaDeviceProp); -#undef DECLARE_TYPE_FOR_GPU - -#ifndef PADDLE_WITH_MUSA #ifdef PADDLE_WITH_HIP #define DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = ROCM_TYPE; -#else // PADDLE_WITH_MUSA +#else // PADDLE_WITH_CDUA + #define 
DECLARE_TYPE_FOR_GPU(GPU_TYPE, CUDA_TYPE, ROCM_TYPE) \ using GPU_TYPE = CUDA_TYPE; -#endif // PADDLE_WITH_CUDA +#endif +DECLARE_TYPE_FOR_GPU(gpuError_t, cudaError_t, hipError_t); +DECLARE_TYPE_FOR_GPU(gpuMemcpyKind, cudaMemcpyKind, hipMemcpyKind); +DECLARE_TYPE_FOR_GPU(gpuDeviceProp, cudaDeviceProp, hipDeviceProp_t); DECLARE_TYPE_FOR_GPU(dnnDataType_t, cudnnDataType_t, miopenDataType_t); DECLARE_TYPE_FOR_GPU(dnnPoolingMode_t, cudnnPoolingMode_t, miopenPoolingMode_t); DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, @@ -75,45 +50,34 @@ DECLARE_TYPE_FOR_GPU(dnnTensorFormat_t, DECLARE_TYPE_FOR_GPU(dnnActivationMode_t, cudnnActivationMode_t, miopenActivationMode_t); + #undef DECLARE_TYPE_FOR_GPU -#endif #ifdef PADDLE_WITH_HIP -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = ROCM_CV; -#elif defined(PADDLE_WITH_MUSA) -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ - constexpr auto GPU_CV = MUSA_CV; #else // PADDLE_WITH_CUDA -#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV, MUSA_CV) \ +#define DECLARE_CONSTANT_FOR_GPU(GPU_CV, CUDA_CV, ROCM_CV) \ constexpr auto GPU_CV = CUDA_CV; #endif DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory, cudaErrorMemoryAllocation, - hipErrorOutOfMemory, - musaErrorMemoryAllocation); -DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, - cudaErrorNotReady, - hipErrorNotReady, - musaErrorNotReady); -DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess, musaSuccess); + hipErrorOutOfMemory); +DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady); +DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice, cudaMemcpyKind::cudaMemcpyHostToDevice, - hipMemcpyKind::hipMemcpyHostToDevice, - musaMemcpyKind::musaMemcpyHostToDevice); + hipMemcpyKind::hipMemcpyHostToDevice); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost, cudaMemcpyKind::cudaMemcpyDeviceToHost, - hipMemcpyKind::hipMemcpyDeviceToHost, - musaMemcpyKind::musaMemcpyDeviceToHost); + hipMemcpyKind::hipMemcpyDeviceToHost); DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice, cudaMemcpyKind::cudaMemcpyDeviceToDevice, - hipMemcpyKind::hipMemcpyDeviceToDevice, - musaMemcpyKind::musaMemcpyDeviceToDevice); + hipMemcpyKind::hipMemcpyDeviceToDevice); #undef DECLARE_CONSTANT_FOR_GPU } // namespace phi -#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || - // defined(PADDLE_WITH_MUSA ) +#endif // defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/backends/gpu/musa/mudnn_desc.h b/paddle/phi/backends/gpu/musa/mudnn_desc.h deleted file mode 100644 index 9de12d586bea01..00000000000000 --- a/paddle/phi/backends/gpu/musa/mudnn_desc.h +++ /dev/null @@ -1,202 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
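
The descriptor helpers in the deleted mudnn_desc.h and mudnn_helper.h reproduced below repeatedly derive contiguous (row-major) strides from a dims vector before passing both to SetNdInfo. A standalone sketch of that computation, with a small main() added here purely for illustration:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Contiguous (row-major) strides: the last dim has stride 1, and each earlier
    // stride is the product of all dims to its right.
    std::vector<int64_t> ContiguousStrides(const std::vector<int64_t>& dims) {
      std::vector<int64_t> strides(dims.size());
      if (dims.empty()) return strides;
      strides[dims.size() - 1] = 1;
      for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
        strides[i] = dims[i + 1] * strides[i + 1];
      }
      return strides;
    }

    int main() {
      // NCHW dims {8, 3, 32, 32} -> strides {3072, 1024, 32, 1}.
      for (int64_t s : ContiguousStrides({8, 3, 32, 32})) std::cout << s << ' ';
      std::cout << '\n';
      return 0;
    }
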
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/phi/backends/gpu/musa/mudnn_helper.h" -#include "paddle/phi/core/utils/data_type.h" - -namespace phi { -namespace backends { -namespace gpu { - -template -inline std::vector TransformDimOrder(const std::vector& dims) { - std::vector transformed_dims(dims.begin(), dims.end()); - if (dims.size() < 4) { - return transformed_dims; - } - T H, W, D, C; - if (dims.size() == 4) { - H = dims[1]; - W = dims[2]; - C = dims[3]; - transformed_dims[1] = C; - transformed_dims[2] = H; - transformed_dims[3] = W; - } else { - D = dims[1]; - H = dims[2]; - W = dims[3]; - C = dims[4]; - transformed_dims[1] = C; - transformed_dims[2] = D; - transformed_dims[3] = H; - transformed_dims[4] = W; - } - return transformed_dims; -} - -inline dynload::Tensor::Type ToCudnnDataType(const phi::DataType& t) { - dynload::Tensor::Type type = dynload::Tensor::Type::FLOAT; - switch (t) { - case phi::DataType::FLOAT16: - type = dynload::Tensor::Type::HALF; - break; - case phi::DataType::FLOAT32: - type = dynload::Tensor::Type::FLOAT; - break; - case phi::DataType::FLOAT64: - type = dynload::Tensor::Type::DOUBLE; - break; - default: - PD_THROW("Don't support this data type ", t); - } - return type; -} - -class TensorDescriptor { - public: - using T = dynload::Tensor; - TensorDescriptor() : desc_(std::make_unique()) {} - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - void set(const phi::DenseTensor& tensor, const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - desc_->SetType(ToCudnnDataType(tensor.dtype())); - desc_->SetNdInfo(static_cast(dims.size()), dims.data(), strides.data()); - desc_->SetAddr(tensor.data()); - } - - template - void set(const phi::DenseTensor& tensor, const Type* data) { - auto dims = phi::vectorize(tensor.dims()); - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - desc_->SetType(ToCudnnDataType(tensor.dtype())); - desc_->SetNdInfo(static_cast(dims.size()), dims.data(), strides.data()); - desc_->SetAddr(data); - } - - void set(const std::vector& dims, - const dynload::Tensor::Format format, - const dynload::Tensor::Type dtype) { - std::vector transformed_dims; - std::vector dims_64(dims.begin(), dims.end()); - if (format == dynload::Tensor::Format::NHWC) { - transformed_dims = TransformDimOrder(dims_64); - } else { - transformed_dims = dims_64; - } - desc_->SetFormat(format); - desc_->SetType(dtype); - desc_->SetNdInfo(static_cast(transformed_dims.size()), transformed_dims.data()); - } - - void set(const phi::DenseTensor& tensor, - const dynload::Tensor::Format format) { - auto dims = phi::vectorize(tensor.dims()); - auto dtype = ToCudnnDataType(tensor.dtype()); - set(dims, format, dtype); - desc_->SetAddr(tensor.data()); - } - - private: - std::unique_ptr desc_; -}; - -class FilterDescriptor { - public: - using T = phi::dynload::Tensor; - FilterDescriptor() : desc_(std::make_unique()) {} - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(const std::vector& dims, - const dynload::Tensor::Format format, - const dynload::Tensor::Type dtype, - const int groups = 1) { - std::vector transformed_dims; - std::vector dims_64(dims.begin(), 
dims.end()); - if (format == dynload::Tensor::Format::NHWC) { - transformed_dims = TransformDimOrder(dims_64); - } else { - transformed_dims = dims_64; - } - if (groups > 1) { - transformed_dims[1] = transformed_dims[1] / groups; - } - desc_->SetFormat(format); - desc_->SetType(dtype); - desc_->SetNdInfo(static_cast(transformed_dims.size()), transformed_dims.data()); - } - - void set(const phi::DenseTensor& tensor, - const dynload::Tensor::Format format, - const int groups = 1) { - auto dims = phi::vectorize(tensor.dims()); - auto dtype = ToCudnnDataType(tensor.dtype()); - set(dims, format, dtype, groups); - desc_->SetAddr(tensor.data()); - } - - private: - std::unique_ptr desc_; -}; - -class ConvolutionDescriptor { - public: - using T = dynload::Convolution; - ConvolutionDescriptor() : desc_(std::make_unique()) {} - T* desc() { return desc_.get(); } - T* desc() const { return desc_.get(); } - - void set(dynload::Tensor::Type dtype, - const std::vector& pads, - const std::vector& strides, - const std::vector& dilations, - bool allow_tf32, - const int groups = 1) { - allow_tf32_ = allow_tf32; - desc_->SetNdInfo( - pads.size(), pads.data(), strides.data(), dilations.data()); - desc_->SetComputeMode(dynload::Convolution::ComputeMode::TENSOR); - desc_->SetGroups(groups); - } - - bool allow_tf32_; - - private: - std::unique_ptr desc_; -}; - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/mudnn_helper.h b/paddle/phi/backends/gpu/musa/mudnn_helper.h deleted file mode 100644 index 55030e860b4213..00000000000000 --- a/paddle/phi/backends/gpu/musa/mudnn_helper.h +++ /dev/null @@ -1,323 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include - -#include "gflags/gflags.h" -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/enforce.h" - -#define CUDNN_BN_MIN_EPSILON 1e-05 - -DECLARE_bool(cudnn_deterministic); - -namespace phi { -namespace backends { -namespace gpu { - -#define CUDNN_VERSION_MIN(major, minor, patch) \ - (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch))) - -enum class DataLayout { // Not use - kNHWC, - kNCHW, - kNCDHW, - kNDHWC, // add, liyamei - kNCHW_VECT_C, -}; - -enum class PoolingMode { - kMaximum, - kMaximumDeterministic, - kAverageExclusive, - kAverageInclusive, -}; - -inline dynload::Pooling::Mode GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - // case PoolingMode::kMaximumDeterministic: - // return CUDNN_POOLING_MAX_DETERMINISTIC; - case PoolingMode::kAverageExclusive: - return dynload::Pooling::Mode::AVGPOOL_COUNT_WITHOUT_PAD; - case PoolingMode::kAverageInclusive: - return dynload::Pooling::Mode::AVGPOOL_COUNT_PAD; - case PoolingMode::kMaximum: - return dynload::Pooling::Mode::MAXPOOL; - default: - PADDLE_THROW( - phi::errors::Unimplemented("Unexpected MUDNN pooling mode.")); - } -} - -template -class CudnnDataType; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::BFLOAT16; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::HALF; - // The scaling param type is float for HALF and FLOAT tensors - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::FLOAT; - using ScalingParamType = const float; - using BatchNormParamType = float; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -template <> -class CudnnDataType { - public: - static const dynload::Tensor::Type type = dynload::Tensor::Type::DOUBLE; - using ScalingParamType = const double; - using BatchNormParamType = double; - static ScalingParamType* kOne() { - static ScalingParamType v = 1.0; - return &v; - } - static ScalingParamType* kZero() { - static ScalingParamType v = 0.0; - return &v; - } -}; - -inline dynload::Tensor::Format GetCudnnTensorFormat( - const DataLayout& order) { // Not use - switch (order) { - case DataLayout::kNHWC: - return dynload::Tensor::Format::NHWC; - case DataLayout::kNCHW: - return dynload::Tensor::Format::NCHW; - case DataLayout::kNCDHW: - return dynload::Tensor::Format::NCDHW; - case DataLayout::kNDHWC: - return dynload::Tensor::Format::NDHWC; - default: - PADDLE_THROW(phi::errors::Unimplemented( - "MUDNN has no equivalent dataLayout for input order.")); - } - return dynload::Tensor::Format::NCHW; -} - -class ScopedTensorDescriptor { - public: - 
ScopedTensorDescriptor() {} - ~ScopedTensorDescriptor() PADDLE_MAY_THROW {} - - inline dynload::Tensor descriptor(const dynload::Tensor::Format format, - const dynload::Tensor::Type type, - const std::vector& dims, - const int groups = 1) { - // the format is not used now, will add later - std::vector strides(dims.size()); - strides[dims.size() - 1] = 1; - for (int i = dims.size() - 2; i >= 0; i--) { - strides[i] = dims[i + 1] * strides[i + 1]; - } - // Update tensor descriptor dims setting if groups > 1 - // NOTE: Here, Assume using NCHW or NCDHW order - std::vector dims_with_group(dims.begin(), dims.end()); - if (groups > 1) { - dims_with_group[1] = dims_with_group[1] / groups; - } - - PADDLE_ENFORCE_EQ( - format, - dynload::Tensor::Format::NCHW, - phi::errors::InvalidArgument("format should ONLY be NCHW in MUDNN.")); - - desc_.SetNdInfo( - static_cast(dims_with_group.size()), dims_with_group.data(), strides.data()); - desc_.SetType(type); - desc_.SetFormat(format); - - return desc_; - } - - template - inline dynload::Tensor& descriptor(const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - descriptor( - GetCudnnTensorFormat(order), CudnnDataType::type, dims, groups); - return desc_; - } - - template - inline dynload::Tensor& descriptor(const phi::DenseTensor& tensor, - const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - desc_.SetAddr(tensor.data()); - descriptor(order, dims, groups); - return desc_; - } - - template - inline dynload::Tensor& descriptor(const T* data, - const DataLayout& order, - const std::vector& dims, - const int groups = 1) { - desc_.SetAddr(data); - descriptor(order, dims, groups); - return desc_; - } - - inline dynload::Tensor& descriptor(const dynload::Tensor::Type mudnn_type, - const std::vector& dim, - const std::vector& stride) { - std::vector dims_64(dim.begin(), dim.end()); - std::vector stride_64(dim.begin(), dim.end()); - desc_.SetType(mudnn_type); - desc_.SetNdInfo(static_cast(dims_64.size()), dims_64.data(), stride_64.data()); - return desc_; - } - - template - inline dynload::Tensor& descriptor(const std::vector& dim, - const std::vector& stride) { - descriptor(CudnnDataType::type, dim, stride); - return desc_; - } - - inline dynload::Tensor& desc() { return desc_; } - - private: - dynload::Tensor desc_; - DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor); -}; - -class ScopedPoolingDescriptor { - public: - ScopedPoolingDescriptor() {} - ~ScopedPoolingDescriptor() PADDLE_MAY_THROW {} - - inline dynload::Pooling& descriptor(const PoolingMode& mode, - const std::vector& kernel, - const std::vector& pads, - const std::vector& strides) { - PADDLE_ENFORCE_EQ(kernel.size(), - pads.size(), - phi::errors::InvalidArgument( - "The size of kernel and pads should be equal. But " - "received size of kernel is %d, size of pads is %d.", - kernel.size(), - pads.size())); - PADDLE_ENFORCE_EQ( - kernel.size(), - strides.size(), - phi::errors::InvalidArgument( - "The size of kernel and strides should be equal. 
But " - "received size of kernel is %d, size of strides is %d.", - kernel.size(), - strides.size())); - const std::vector dilation(kernel.size(), 1); - desc_.SetNdInfo(kernel.size(), - kernel.data(), - pads.data(), - strides.data(), - dilation.data()); - desc_.SetMode(GetPoolingMode(mode)); - return desc_; - } - - dynload::Pooling& desc() { return desc_; } - - private: - dynload::Pooling desc_; - DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); -}; - -class ScopedSoftmaxDescriptor { - public: - ScopedSoftmaxDescriptor() {} - ~ScopedSoftmaxDescriptor() PADDLE_MAY_THROW {} - - inline dynload::Softmax& descriptor(const dynload::Softmax::Mode& mode, - const dynload::Softmax::Algorithm& algo, - const int& dim) { - desc_.SetMode(mode); - desc_.SetDim(dim); - desc_.SetAlgorithm(algo); - return desc_; - } - - dynload::Softmax& desc() { return desc_; } - - private: - dynload::Softmax desc_; - DISABLE_COPY_AND_ASSIGN(ScopedSoftmaxDescriptor); -}; - -static void InternalMemFree(void* ptr) { - if (!ptr) { - return; - } - PADDLE_ENFORCE_GPU_SUCCESS(musaFree(ptr)); -} - -static dynload::MemoryHandler InternalMemAlloc(size_t s) { - void* data = nullptr; - if (s) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMalloc(&data, s)); - } - return dynload::MemoryHandler(data, InternalMemFree); -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_device_function.h b/paddle/phi/backends/gpu/musa/musa_device_function.h deleted file mode 100644 index f2847daf4dfacb..00000000000000 --- a/paddle/phi/backends/gpu/musa/musa_device_function.h +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define PADDLE_CUDA_FP16 -// NOTE(): support float16 to half in header file. -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/enforce.h" - -namespace phi { -namespace backends { -namespace gpu { - -#define FULL_WARP_MASK 0xFFFFFFFF -#define CREATE_SHFL_MASK(mask, predicate) \ - mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - -#define CUDA_LAUNCH_KERNEL_BASE(dim, ...) \ - case (dim): { \ - constexpr auto kPowerOfTwoDim = (dim); \ - __VA_ARGS__; \ - } break - -#define CUDA_LAUNCH_KERNEL_HELPER(...) 
\ - CUDA_LAUNCH_KERNEL_BASE(1024, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(512, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(256, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(128, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(64, ##__VA_ARGS__); \ - CUDA_LAUNCH_KERNEL_BASE(32, ##__VA_ARGS__); - -template -__forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { - return __shfl_down_sync(mask, val, static_cast(delta), width); -} - -template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - T val, - int width = warpSize) { - return __shfl_xor_sync(mask, val, width); -} - - -#if defined(PADDLE_WITH_MUSA) -// Due to the inconsistency between mcc and nvcc, certain type conversions are not implicitly performed, so we specialize here. -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync(unsigned mask, - phi::dtype::float16 val, - int width) { - return (phi::dtype::float16)(__shfl_xor_sync(mask, float(val), width)); -} -#endif - -template <> -__forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { - return phi::dtype::float16(__shfl_down_sync( - mask, val.to_half(), static_cast(delta), width)); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { -#if defined(PADDLE_MUSA_BF16) && defined(__MUSA_ARCH__) && __MUSA_ARCH__ >= 220 - return phi::dtype::bfloat16(__shfl_down_sync( - mask, val.to_mt_bfloat16(), static_cast(delta), width)); -#else - PADDLE_ENFORCE( - false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - float real = static_cast(__shfl_down_sync( - mask, static_cast(val.real), static_cast(delta), width)); - float imag = static_cast(__shfl_down_sync( - mask, static_cast(val.imag), static_cast(delta), width)); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { - double real = - static_cast(__shfl_down_sync(mask, - static_cast(val.real), - static_cast(delta), - width)); - double imag = - static_cast(__shfl_down_sync(mask, - static_cast(val.imag), - static_cast(delta), - width)); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { -#if defined(PADDLE_MUSA_BF16) - return phi::dtype::bfloat16( - __shfl_xor_sync(mask, val.to_mt_bfloat16(), width)); -#else - PADDLE_ENFORCE( - false, "__shfl_xor_sync with bfloat16 is not supported on cuda <= 11."); -#endif -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - float real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - float imag = static_cast( - __shfl_xor_sync(mask, static_cast(val.imag), width)); - return phi::dtype::complex(real, imag); -} - -template <> -__forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { - double real = static_cast( - __shfl_xor_sync(mask, static_cast(val.real), width)); - double imag = static_cast( - __shfl_xor_sync(mask, 
static_cast<double>(val.imag), width)); - return phi::dtype::complex<double>(real, imag); -} - -template <typename T> -__forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { - return __shfl_sync(mask, val, src_line, width); -} - -template <typename T> -HOSTDEVICE T Infinity() { - return INFINITY; -} - -template <typename T> -__device__ T reduceSum(T val, int tid, int len) { - // NOTE(zcd): The warp size should be taken from the - // parameters of the GPU rather than hard-coded as 32. - // To make reduceSum more efficient, - // I use warp-level parallelism and assume the warp size - // is 32, which may differ between GPUs, - // but most cards' warp size is 32. - const int warpSize = 32; - __shared__ T shm[warpSize]; - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, tid < len); - - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - - if (tid < warpSize) shm[tid] = 0; - __syncthreads(); - - if (tid % warpSize == 0) { - shm[tid / warpSize] = val; - } - __syncthreads(); - - CREATE_SHFL_MASK(mask, tid < warpSize); - - if (tid < warpSize) { - val = shm[tid]; - for (int offset = warpSize / 2; offset > 0; offset /= 2) - val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset); - } - return val; -} -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_helper.h b/paddle/phi/backends/gpu/musa/musa_helper.h deleted file mode 100644 index 7463edc5d9ff60..00000000000000 --- a/paddle/phi/backends/gpu/musa/musa_helper.h +++ /dev/null @@ -1,74 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -namespace phi { -namespace backends { -namespace gpu { - -/* - * Summary: Grid stride looping macro in CUDA kernel - * - * [ Why is this macro needed? ] - * - * The original loop in a CUDA kernel is: - * - * `for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ - * i += blockDim.x * gridDim.x)` - * - * This for condition is risky. The value of `blockIdx.x * blockDim.x` - * may be large, such as over 1GB; the first iteration is fine, - * but when `i += blockDim.x * gridDim.x` is executed, the value of i - * can exceed INT_MAX and overflow to a negative value. At that point - * the loop condition `i < (n)` is still satisfied, so the kernel - * performs an illegal access to CUDA memory. - * - * Here is a real example from ERNIE that triggers the above error. - * The related data are: - * - blockIdx.x = 2172938 - * - blockDim.x = 512 - * - blockIdx.x * blockDim.x = 1112543864 - * - INT_MAX = 2147483647 - * - * So we polish the for condition as follows; the int64_t __index__ - * prevents overflow in the loop increment.
- * - * Parameters: - * - i: loop index - * - num: total element numbers - * - * Examples: - * template - * __global__ void Scale(T* logit_grad, const T* loss_grad, const int num, - * const int d, const int remain) { - * CUDA_KERNEL_LOOP(index, num) { - * int idx_n = index / d; - * int idx_remain = index % remain; - * logit_grad[index] *= loss_grad[idx_n * remain + idx_remain]; - * } - * } - * - */ - -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += __stride__, i = __index__) - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/backends/gpu/musa/musa_info.cc b/paddle/phi/backends/gpu/musa/musa_info.cc deleted file mode 100644 index cab81b58f5ecb2..00000000000000 --- a/paddle/phi/backends/gpu/musa/musa_info.cc +++ /dev/null @@ -1,334 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include - -#include "paddle/fluid/framework/fleet/heter_ps/log_patch.h" -#include "paddle/phi/backends/gpu/gpu_info.h" - -#include "paddle/phi/core/enforce.h" - -#include "musa_runtime.h" - -static std::once_flag g_device_props_size_init_flag; -static std::vector> g_device_props_init_flags; -static std::vector g_device_props; - -namespace phi { -namespace backends { -namespace gpu { - -int DnnVersion() { - if (!dynload::HasCUDNN()) return -1; - // TODO(@caizhi): mudnnGetVersion is not supported now. - // version info will be returned from mudnnGetVersion later. - const int version_major = 2; - const int version_minor = 5; - const int version_patch = 0; - return version_major * 1000 + version_minor * 100 + version_patch; -} - -static int GetGPUDeviceCountImpl() { - int driverVersion = 0; - musaError_t status = musaDriverGetVersion(&driverVersion); - - if (!(status == gpuSuccess && driverVersion != 0)) { - // No GPU driver - VLOG(2) << "GPU Driver Version can't be detected. No GPU driver!"; - return 0; - } - - const auto *musa_visible_devices = std::getenv("MUSA_VISIBLE_DEVICES"); - - if (musa_visible_devices != nullptr) { - std::string musa_visible_devices_str(musa_visible_devices); - if (!musa_visible_devices_str.empty()) { - musa_visible_devices_str.erase( - 0, musa_visible_devices_str.find_first_not_of('\'')); - musa_visible_devices_str.erase( - musa_visible_devices_str.find_last_not_of('\'') + 1); - musa_visible_devices_str.erase( - 0, musa_visible_devices_str.find_first_not_of('\"')); - musa_visible_devices_str.erase( - musa_visible_devices_str.find_last_not_of('\"') + 1); - } - if (std::all_of(musa_visible_devices_str.begin(), - musa_visible_devices_str.end(), - [](char ch) { return ch == ' '; })) { - VLOG(2) << "MUSA_VISIBLE_DEVICES is set to be " - "empty. 
No GPU detected."; - return 0; - } - } - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDeviceCount(&count)); - return count; -} - -int GetGPUDeviceCount() { - // cache the count - static auto dev_cnt = GetGPUDeviceCountImpl(); - return dev_cnt; -} - -int GetGPUComputeCapability(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int major, minor; - auto major_error_code = - musaDeviceGetAttribute(&major, musaDevAttrComputeCapabilityMajor, id); - auto minor_error_code = - musaDeviceGetAttribute(&minor, musaDevAttrComputeCapabilityMinor, id); - - PADDLE_ENFORCE_GPU_SUCCESS(major_error_code); - PADDLE_ENFORCE_GPU_SUCCESS(minor_error_code); - return major * 10 + minor; -} - -int GetGPURuntimeVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int runtime_version = 0; - // Note: runtime_version = MAJOR * 10000 + MINOR * 100 + PATCH - PADDLE_ENFORCE_GPU_SUCCESS(musaRuntimeGetVersion(&runtime_version)); - return runtime_version; -} - -int GetGPUDriverVersion(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int driver_version = 0; - // Note: driver_version = MAJOR * 10000 + MINOR * 100 + PATCH - PADDLE_ENFORCE_GPU_SUCCESS(musaDriverGetVersion(&driver_version)); - return driver_version; -} - -bool TensorCoreAvailable() { return false; } - -int GetGPUMultiProcessors(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDevAttrMultiProcessorCount, id)); - return count; -} - -int GetGPUMaxThreadsPerMultiProcessor(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceGetAttribute( - &count, musaDevAttrMaxThreadsPerMultiProcessor, id)); - - return count; -} - -int GetGPUMaxThreadsPerBlock(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - id, - GetGPUDeviceCount())); - int count; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetAttribute(&count, musaDevAttrMaxThreadsPerBlock, id)); - return count; -} - -int GetCurrentDeviceId() { - int device_id; - PADDLE_ENFORCE_GPU_SUCCESS(musaGetDevice(&device_id)); - return device_id; -} - -std::array GetGpuMaxGridDimSize(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - std::array ret; - int size; - auto error_code_x = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimX, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_x); - ret[0] = size; - - auto error_code_y = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimY, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_y); - ret[1] = size; - - auto error_code_z = musaDeviceGetAttribute(&size, musaDevAttrMaxGridDimZ, id); - PADDLE_ENFORCE_GPU_SUCCESS(error_code_z); - ret[2] = size; - return ret; -} - -std::pair GetGpuStreamPriorityRange() { - int least_priority, greatest_priority; - PADDLE_ENFORCE_GPU_SUCCESS( - musaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority)); - return std::make_pair(least_priority, greatest_priority); -} - -const gpuDeviceProp &GetDeviceProperties(int id) { - std::call_once(g_device_props_size_init_flag, [&] { - int gpu_num = 0; - gpu_num = GetGPUDeviceCount(); - g_device_props_init_flags.resize(gpu_num); - g_device_props.resize(gpu_num); - for (int i = 0; i < gpu_num; ++i) { - g_device_props_init_flags[i] = std::make_unique(); - } - }); - - if (id == -1) { - id = GetCurrentDeviceId(); - } - - if (id < 0 || id >= static_cast(g_device_props.size())) { - PADDLE_THROW(phi::errors::OutOfRange( - "The device id %d is out of range [0, %d), where %d is the number of " - "devices on this machine. Because the device id should be greater than " - "or equal to zero and smaller than the number of gpus. Please input " - "appropriate device again!", - id, - static_cast(g_device_props.size()), - static_cast(g_device_props.size()))); - } - - std::call_once(*(g_device_props_init_flags[id]), [&] { - PADDLE_ENFORCE_GPU_SUCCESS( - musaGetDeviceProperties(&g_device_props[id], id)); - }); - //TODO@mtai:we hope not to skip UT that ask compute capacity to be greater than 7/8 - g_device_props[id].major = 9; - g_device_props[id].minor = 9; - return g_device_props[id]; -} - -void SetDeviceId(int id) { - PADDLE_ENFORCE_LT( - id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - id, - GetGPUDeviceCount())); - PADDLE_RETRY_CUDA_SUCCESS(musaSetDevice(id)); -} - -void GpuMemcpyAsync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpyAsync(dst, src, count, kind, stream)); -} - -void GpuMemcpySync(void *dst, - const void *src, - size_t count, - gpuMemcpyKind kind) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemcpy(dst, src, count, kind)); - -} - -void GpuMemcpyPeerAsync(void *dst, - int dst_device, - const void *src, - int src_device, - size_t count, - gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream)); -} - -void GpuMemcpyPeerSync( - void *dst, int dst_device, const void *src, int src_device, size_t count) { - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemcpyPeer(dst, dst_device, src, src_device, count)); -} - -void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaMemsetAsync(dst, value, count, stream)); -} - -void GpuStreamSync(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(stream)); -} - -void GpuDestroyStream(gpuStream_t stream) { - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(stream)); -} - -void GpuDeviceSync() { PADDLE_ENFORCE_GPU_SUCCESS(musaDeviceSynchronize()); } - -gpuError_t GpuGetLastError() { return musaGetLastError(); } - -bool IsGPUManagedMemorySupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - return false; -} - -bool IsGPUManagedMemoryOversubscriptionSupported(int dev_id) { - PADDLE_ENFORCE_LT( - dev_id, - GetGPUDeviceCount(), - phi::errors::InvalidArgument("Device id must be less than GPU count, " - "but received id is: %d. 
GPU count is: %d.", - dev_id, - GetGPUDeviceCount())); - return false; -} - -} // namespace gpu -} // namespace backends -} // namespace phi diff --git a/paddle/phi/capi/include/c_meta_tensor.h b/paddle/phi/capi/include/c_meta_tensor.h index 08f01084c6abf3..f4c9a541e526aa 100644 --- a/paddle/phi/capi/include/c_meta_tensor.h +++ b/paddle/phi/capi/include/c_meta_tensor.h @@ -39,6 +39,13 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, size_t index, PD_Status *status); +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status); + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status); + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status); void PD_MetaTensorSetDims(PD_MetaTensor *tensor, @@ -46,6 +53,11 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, const int64_t *dims, PD_Status *status); +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/c_tensor.h b/paddle/phi/capi/include/c_tensor.h index c4f706c70ccfb4..2df292c6b946b2 100644 --- a/paddle/phi/capi/include/c_tensor.h +++ b/paddle/phi/capi/include/c_tensor.h @@ -41,6 +41,12 @@ int64_t PD_TensorGetDim(const PD_Tensor *tensor, size_t index, PD_Status *status); +int64_t PD_TensorGetNumStrides(const PD_Tensor *tensor, PD_Status *status); + +int64_t PD_TensorGetStride(const PD_Tensor *tensor, + size_t index, + PD_Status *status); + void PD_TensorGetLoD(const PD_Tensor *tensor, PD_List *data, PD_List *offset, @@ -52,11 +58,22 @@ bool PD_TensorIsValid(const PD_Tensor *tensor, PD_Status *status); void *PD_TensorGetHolder(const PD_Tensor *tensor, PD_Status *status); +size_t PD_TensorGetOffset(const PD_Tensor *tensor, PD_Status *status); + void PD_TensorSetDims(PD_Tensor *tensor, int64_t ndims, const int64_t *dims, PD_Status *status); +void PD_TensorSetOffset(PD_Tensor *tensor, + const int64_t offset, + PD_Status *status); + +void PD_TensorSetStrides(PD_Tensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status); + void PD_TensorSetDataType(PD_Tensor *tensor, PD_DataType dtype, PD_Status *status); diff --git a/paddle/phi/capi/include/wrapper_base.h b/paddle/phi/capi/include/wrapper_base.h index 061561008a95e7..75f3e2d9e350eb 100644 --- a/paddle/phi/capi/include/wrapper_base.h +++ b/paddle/phi/capi/include/wrapper_base.h @@ -72,6 +72,19 @@ inline std::vector PD_TensorGetDims(PD_Tensor* tensor, return std::vector(); } +inline std::vector PD_TensorGetStrides(PD_Tensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_TensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_TensorGetStride(tensor, i, status); + } + return shape; + } + return std::vector(); +} + inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, PD_Status* status) { int64_t ndims = PD_MetaTensorGetNumDims(tensor, status); @@ -85,6 +98,19 @@ inline std::vector PD_MetaTensorGetDims(PD_MetaTensor* tensor, return std::vector(); } +inline std::vector PD_MetaTensorGetStrides(PD_MetaTensor* tensor, + PD_Status* status) { + int64_t nstrides = PD_MetaTensorGetNumStrides(tensor, status); + if (nstrides > 0) { + std::vector shape(nstrides); + for (int64_t i = 0; i < nstrides; ++i) { + shape[i] = PD_MetaTensorGetStride(tensor, i, status); + } + return shape; + } + 
return std::vector(); +} + template class WrapperBase { public: @@ -134,6 +160,13 @@ class DenseTensor : public WrapperBase { return holder; } + size_t offset() const { + C_Status status; + auto offset = PD_TensorGetOffset(raw_data(), &status); + PD_CHECK_STATUS(status); + return offset; + } + std::vector dims() const { C_Status status; auto dimension = PD_TensorGetDims(raw_data(), &status); @@ -141,6 +174,13 @@ class DenseTensor : public WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_TensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_TensorGetPDDataType(raw_data(), &status); @@ -207,6 +247,18 @@ class DenseTensor : public WrapperBase { PD_CHECK_STATUS(status); } + void set_offset(const int64_t& offset) { + C_Status status; + PD_TensorSetOffset(raw_data(), offset, &status); + PD_CHECK_STATUS(status); + } + + void set_strides(const std::vector& strides) { + C_Status status; + PD_TensorSetStrides(raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; PD_TensorSetDataType(raw_data(), data_type, &status); @@ -513,6 +565,13 @@ class MetaTensor : WrapperBase { return dimension; } + std::vector strides() const { + C_Status status; + auto strides = PD_MetaTensorGetStrides(raw_data(), &status); + PD_CHECK_STATUS(status); + return strides; + } + PD_DataType dtype() const { C_Status status; auto data_type = PD_MetaTensorGetPDDataType(raw_data(), &status); @@ -540,6 +599,13 @@ class MetaTensor : WrapperBase { PD_CHECK_STATUS(status); } + void set_strides(const std::vector& strides) { + C_Status status; + PD_MetaTensorSetStrides( + raw_data(), strides.size(), strides.data(), &status); + PD_CHECK_STATUS(status); + } + void set_dtype(PD_DataType data_type) { C_Status status; PD_MetaTensorSetDataType(raw_data(), data_type, &status); diff --git a/paddle/phi/capi/lib/c_device_context.cc b/paddle/phi/capi/lib/c_device_context.cc index 6dc1ff768260d7..b415ece7e361d2 100644 --- a/paddle/phi/capi/lib/c_device_context.cc +++ b/paddle/phi/capi/lib/c_device_context.cc @@ -35,7 +35,7 @@ PD_Stream PD_DeviceContextGetStream(const PD_DeviceContext* ctx, reinterpret_cast(ctx)->stream()); } else if (dev_ctx_type == phi::AllocationType::CPU) { return nullptr; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast( reinterpret_cast(ctx)->stream()); diff --git a/paddle/phi/capi/lib/c_kernel_context.cc b/paddle/phi/capi/lib/c_kernel_context.cc index 7df79117dbae5d..e9fe2aada1f35f 100644 --- a/paddle/phi/capi/lib/c_kernel_context.cc +++ b/paddle/phi/capi/lib/c_kernel_context.cc @@ -30,7 +30,7 @@ PD_DeviceContext* PD_KernelContextGetDeviceContext(PD_KernelContext* ctx) { } else if (dev_ctx_type == phi::AllocationType::CPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (dev_ctx_type == phi::AllocationType::GPU) { return reinterpret_cast(const_cast( &kernel_context->GetDeviceContext())); diff --git a/paddle/phi/capi/lib/c_meta_tensor.cc b/paddle/phi/capi/lib/c_meta_tensor.cc index 6ea6eda1a7f23e..f436ba9d3cde0d 100644 --- 
a/paddle/phi/capi/lib/c_meta_tensor.cc +++ b/paddle/phi/capi/lib/c_meta_tensor.cc @@ -88,6 +88,36 @@ int64_t PD_MetaTensorGetDim(const PD_MetaTensor *tensor, return cc_tensor->dims()[index]; } +int64_t PD_MetaTensorGetNumStrides(const PD_MetaTensor *tensor, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_MetaTensorGetStride(const PD_MetaTensor *tensor, + size_t index, + PD_Status *status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + bool PD_MetaTensorIsValid(const PD_MetaTensor *tensor, PD_Status *status) { if (status) { if (!tensor) { @@ -117,6 +147,22 @@ void PD_MetaTensorSetDims(PD_MetaTensor *tensor, cc_tensor->set_dims(common::make_ddim(shape)); } +void PD_MetaTensorSetStrides(PD_MetaTensor *tensor, + int64_t nstrides, + const int64_t *strides, + PD_Status *status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_MetaTensorSetDataType(PD_MetaTensor *tensor, PD_DataType dtype, PD_Status *status) { diff --git a/paddle/phi/capi/lib/c_tensor.cc b/paddle/phi/capi/lib/c_tensor.cc index 31a724447b7c7f..eb8c8c6f4eb47d 100644 --- a/paddle/phi/capi/lib/c_tensor.cc +++ b/paddle/phi/capi/lib/c_tensor.cc @@ -111,6 +111,35 @@ int64_t PD_TensorGetDim(const PD_Tensor* tensor, return cc_tensor->dims()[index]; } +int64_t PD_TensorGetNumStrides(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->strides().size(); +} + +int64_t PD_TensorGetStride(const PD_Tensor* tensor, + size_t index, + PD_Status* status) { + auto cc_tensor = reinterpret_cast(tensor); + + if (status) { + if (!tensor || index >= static_cast(cc_tensor->strides().size())) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + return cc_tensor->strides()[index]; +} + void PD_TensorGetLoD(const PD_Tensor* tensor, PD_List* data, PD_List* offset, @@ -185,6 +214,19 @@ void* PD_TensorGetHolder(const PD_Tensor* tensor, PD_Status* status) { return cc_tensor->Holder().get(); } +size_t PD_TensorGetOffset(const PD_Tensor* tensor, PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return 0; + } + *status = C_SUCCESS; + } + + auto cc_tensor = reinterpret_cast(tensor); + return cc_tensor->offset(); +} + void PD_TensorSetDims(PD_Tensor* tensor, int64_t ndims, const int64_t* dims, @@ -201,6 +243,36 @@ void PD_TensorSetDims(PD_Tensor* tensor, cc_tensor->Resize(common::make_ddim(shape)); } +void PD_TensorSetOffset(PD_Tensor* tensor, + const int64_t offset, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = reinterpret_cast(tensor); + cc_tensor->set_offset(offset); +} + +void PD_TensorSetStrides(PD_Tensor* tensor, + int64_t nstrides, + const int64_t* strides, + PD_Status* status) { + if (status) { + if (!tensor) { + *status = C_FAILED; + return; + } + *status = C_SUCCESS; + } + auto cc_tensor = 
reinterpret_cast(tensor); + std::vector shape(strides, strides + nstrides); + cc_tensor->set_strides(common::make_ddim(shape)); +} + void PD_TensorSetDataType(PD_Tensor* tensor, PD_DataType dtype, PD_Status* status) { diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 4f238496c41494..64dab3ccdeb3b4 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -138,7 +138,7 @@ inline Backend StringToBackend(const char* backend_cstr) { } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; } else if (s == std::string("KPS")) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // NOTE(chenweihang) KPS is not yet a complete backend, and it still needs // to be converted // to GPU in the GPU environment diff --git a/paddle/phi/common/bfloat16.h b/paddle/phi/common/bfloat16.h index 9609dc50a9a0be..028851e34c8bc7 100644 --- a/paddle/phi/common/bfloat16.h +++ b/paddle/phi/common/bfloat16.h @@ -31,13 +31,7 @@ #include #endif -#if defined(__MUSACC__) -#define PADDLE_MUSA_BF16 -#include -#include -#endif - -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -71,14 +65,13 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; -#elif defined(PADDLE_CUDA_BF16) +#else +#if defined(PADDLE_CUDA_BF16) __nv_bfloat16 tmp = __float2bfloat16(val); x = *reinterpret_cast(&tmp); -#elif defined(PADDLE_MUSA_BF16) - __mt_bfloat16 tmp = __float2bfloat16(val); - x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); +#endif #endif } @@ -88,12 +81,6 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif -#if defined(PADDLE_MUSA_BF16) - HOSTDEVICE inline explicit bfloat16(const __mt_bfloat16& val) { - x = *reinterpret_cast(&val); // NOLINT - } -#endif - template HOSTDEVICE inline explicit bfloat16(const T& val) : x(bfloat16(static_cast(val)).x) {} @@ -106,13 +93,6 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif -#if defined(PADDLE_MUSA_BF16) - HOSTDEVICE inline bfloat16& operator=(const __mt_bfloat16& val) { - x = *reinterpret_cast(&val); // NOLINT - return *this; - } -#endif - HOSTDEVICE inline bfloat16& operator=(bool b) { x = b ? 
0x3f80 : 0; return *this; @@ -180,16 +160,16 @@ struct PADDLE_ALIGN(2) bfloat16 { // return res; res = res << 16; return *reinterpret_cast(&res); -#elif defined(PADDLE_CUDA_BF16) +#else +#ifdef PADDLE_CUDA_BF16 return __bfloat162float(*reinterpret_cast(&x)); -#elif defined(PADDLE_MUSA_BF16) - return __bfloat162float(*reinterpret_cast(&x)); #else float val = 0.f; uint16_t temp = x; std::memcpy( reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); return val; +#endif #endif } @@ -199,12 +179,6 @@ struct PADDLE_ALIGN(2) bfloat16 { } #endif -#ifdef PADDLE_MUSA_BF16 - HOSTDEVICE inline __mt_bfloat16 to_mt_bfloat16() const { - return *reinterpret_cast(&x); - } -#endif - HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } HOSTDEVICE inline explicit operator int8_t() const { diff --git a/paddle/phi/common/complex.h b/paddle/phi/common/complex.h index 4fb04ed0f7f666..5de6290fb77057 100644 --- a/paddle/phi/common/complex.h +++ b/paddle/phi/common/complex.h @@ -26,17 +26,12 @@ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif // PADDLE_WITH_MUSA - #ifdef PADDLE_WITH_HIP #include #include // NOLINT #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -46,7 +41,7 @@ #define PADDLE_ALIGN(x) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // todo #define PADDLE_WITH_CUDA_OR_HIP_COMPLEX #endif @@ -71,7 +66,7 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE complex(T real, T imag) : real(real), imag(imag) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template HOSTDEVICE inline explicit complex(const thrust::complex& c) { @@ -100,14 +95,6 @@ struct PADDLE_ALIGN(sizeof(T) * 2) complex { HOSTDEVICE inline explicit operator hipDoubleComplex() const { return make_hipDoubleComplex(real, imag); } -#elif defined(PADDLE_WITH_MUSA) - HOSTDEVICE inline explicit operator muFloatComplex() const { - return make_muFloatComplex(real, imag); - } - - HOSTDEVICE inline explicit operator muDoubleComplex() const { - return make_muDoubleComplex(real, imag); - } #else HOSTDEVICE inline explicit operator cuFloatComplex() const { return make_cuFloatComplex(real, imag); diff --git a/paddle/phi/common/cpstring_impl.h b/paddle/phi/common/cpstring_impl.h index c88d4ac21cd4a0..1906fd4e57a444 100644 --- a/paddle/phi/common/cpstring_impl.h +++ b/paddle/phi/common/cpstring_impl.h @@ -26,7 +26,7 @@ limitations under the License. 
*/ #include "paddle/common/macros.h" -#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) +#if (defined(__NVCC__) || defined(__HIPCC__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ @@ -77,7 +77,7 @@ HOSTDEVICE static inline uint32_t swap32(uint32_t host_int) { } #endif -#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) +#if PD_PSTRING_LITTLE_ENDIAN || (defined(__NVCC__) || defined(__HIPCC__)) #define PD_le32toh(x) x #else // PD_PSTRING_LITTLE_ENDIAN #define PD_le32toh(x) swap32(x) @@ -209,7 +209,7 @@ HOSTDEVICE static inline void *PD_Malloc(size_t size) { return malloc(size); } HOSTDEVICE static inline void *PD_Realloc(void *ptr, size_t old_size UNUSED, size_t new_size) { -#if (defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__)) +#if (defined(__NVCC__) || defined(__HIPCC__)) if (old_size >= new_size) { return ptr; } diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index e4f4a5ae272eb9..9d60b8c6241ae3 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -37,10 +37,6 @@ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#endif // PADDLE_WITH_MUSA - #ifdef PADDLE_WITH_HIP #include #endif @@ -50,17 +46,12 @@ #include #endif -#ifdef __MUSACC__ -#define PADDLE_CUDA_FP16 -#include -#endif - #ifdef __HIPCC__ #define PADDLE_CUDA_FP16 #include #endif -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -95,8 +86,8 @@ struct PADDLE_ALIGN(2) float16 { // Constructors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit float16(const half& h) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&h))->x; #else x = h.x; @@ -115,7 +106,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline explicit float16(float val) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = __float2half(val); x = *reinterpret_cast(&tmp); @@ -157,7 +148,7 @@ struct PADDLE_ALIGN(2) float16 { // Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(const_cast(&rhs))->x; #else x = rhs.x; @@ -231,7 +222,7 @@ struct PADDLE_ALIGN(2) float16 { // Conversion operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline half to_half() const { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)|| CUDA_VERSION >= 9000 +#if defined(PADDLE_WITH_HIP) || CUDA_VERSION >= 9000 __half_raw h; h.x = x; return half(h); @@ -251,7 +242,7 @@ struct PADDLE_ALIGN(2) float16 { HOSTDEVICE inline operator float() const { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300)) half tmp = *reinterpret_cast(this); return __half2float(tmp); @@ -360,7 +351,7 @@ struct 
PADDLE_ALIGN(2) float16 { // CUDA 9.0 regarding the half data type. // ROCM has built-in arithmetic operators as not defined // __HIP_NO_HALF_OPERATORS__ -#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && !defined(__MUSACC__) && CUDA_VERSION < 9000 +#if defined(PADDLE_CUDA_FP16) && !defined(__HIPCC__) && CUDA_VERSION < 9000 DEVICE inline half operator+(const half& a, const half& b) { #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hadd(a, b); @@ -408,7 +399,7 @@ DEVICE inline half operator-(const half& a) { #endif } -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) // not defined __HIP_NO_HALF_OPERATORS__ +#ifndef PADDLE_WITH_HIP // not defined __HIP_NO_HALF_OPERATORS__ DEVICE inline half& operator+=(half& a, const half& b) { // NOLINT a = a + b; return a; @@ -484,7 +475,7 @@ DEVICE inline bool operator>=(const half& a, const half& b) { #if defined(PADDLE_CUDA_FP16) // HIPCC has compile error if call __device__ function __hadd, __hsub, etc. // in __host__ __device__ function -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(__hadd(a.to_half(), b.to_half())); } @@ -501,7 +492,7 @@ HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(__hsub(a.to_half(), b.to_half())); } @@ -518,7 +509,7 @@ HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(__hmul(a.to_half(), b.to_half())); } @@ -535,7 +526,7 @@ HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator/(const float16& a, const float16& b) { return float16(__hdiv(a.to_half(), b.to_half())); } @@ -555,7 +546,7 @@ HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { } #endif -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline float16 operator-(const float16& a) { return float16(__hneg(a.to_half())); } @@ -598,7 +589,7 @@ HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { // NOLINT // HIPCC has compile error if call __device__ function __heq, __hne, etc. 
// in __host__ __device__ function -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator==(const float16& a, const float16& b) { return __heq(a.to_half(), b.to_half()); } @@ -615,7 +606,7 @@ HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator!=(const float16& a, const float16& b) { return __hne(a.to_half(), b.to_half()); } @@ -632,7 +623,7 @@ HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator<(const float16& a, const float16& b) { return __hlt(a.to_half(), b.to_half()); } @@ -649,7 +640,7 @@ HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator<=(const float16& a, const float16& b) { return __hle(a.to_half(), b.to_half()); } @@ -666,7 +657,7 @@ HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator>(const float16& a, const float16& b) { return __hgt(a.to_half(), b.to_half()); } @@ -683,7 +674,7 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { } #endif // __HIPCC__ -#if defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__HIPCC__) DEVICE inline bool operator>=(const float16& a, const float16& b) { return __hge(a.to_half(), b.to_half()); } @@ -974,7 +965,7 @@ DEVICE inline bool(isnan)(const float16& a) { return __hisnan(a.to_half()); } HOST inline bool(isnan)(const float16& a) { return (a.x & 0x7fff) > 0x7c00; } #else HOSTDEVICE inline bool(isnan)(const float16& a) { -#if defined(PADDLE_CUDA_FP16) && ((defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || defined(__MUSACC__)) +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hisnan(a.to_half()); #else return (a.x & 0x7fff) > 0x7c00; @@ -992,7 +983,7 @@ HOSTDEVICE inline bool(isfinite)(const float16& a) { HOSTDEVICE inline float16(abs)(const float16& a) { #if defined(PADDLE_CUDA_FP16) && \ - (defined(__HIPCC__) || defined(__MUSACC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) + (defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530)) return float16(::fabs(static_cast(a))); #else return float16(std::abs(static_cast(a))); diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc index a1fc14073d96ac..1af8cc442a1178 100644 --- a/paddle/phi/common/memory_utils.cc +++ b/paddle/phi/common/memory_utils.cc @@ -69,7 +69,7 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) { dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void GpuMemoryUsage(size_t* available, size_t* total) { return MemoryUtils::Instance().GpuMemoryUsage(available, total); } @@ -90,8 +90,8 @@ void EmplaceDeviceContexts( stream_priority); } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL)) const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { return MemoryUtils::Instance().GetAllocator(device_id, stream); } diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h index abcc6ac003c644..9e4e573277549a 100644 --- a/paddle/phi/common/memory_utils.h +++ b/paddle/phi/common/memory_utils.h @@ -34,11 +34,6 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - namespace phi { struct MemoryInterface { @@ -133,7 +128,7 @@ struct MemoryInterface { int64_t (*device_memory_stat_current_value)(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * @brief get the memory usage of current GPU device. * @@ -166,8 +161,8 @@ struct MemoryInterface { bool disable_setting_default_stream_for_allocator, int stream_priority); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) phi::Allocator* (*get_allocator)(int device_id, phi::gpuStream_t stream); phi::Allocator* (*get_host_allocator)(); phi::Allocator* (*get_zero_allocator)(int device_id); @@ -297,7 +292,7 @@ class MemoryUtils { return memory_method_->device_memory_stat_current_value(stat_type, dev_id); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void GpuMemoryUsage(size_t* available, size_t* total) { CheckMemoryMethod(); PADDLE_ENFORCE_NOT_NULL( @@ -349,8 +344,8 @@ class MemoryUtils { "Fluid. 
You can call InitMemoryMethod() for initialization.")); } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) const phi::Allocator* GetAllocator(int device_id, phi::gpuStream_t stream) { return memory_method_->get_allocator(device_id, stream); } @@ -426,7 +421,7 @@ void Copy(const Place& dst_place, int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void GpuMemoryUsage(size_t* available, size_t* total); #endif @@ -439,8 +434,8 @@ void EmplaceDeviceContexts( bool disable_setting_default_stream_for_allocator, int stream_priority); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA)) && \ - (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)|| defined(PADDLE_WITH_MCCL)) +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)) const Allocator* GetAllocator(int device_id, phi::gpuStream_t stream); const Allocator* GetHostAllocator(); diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index c205bb7675393f..008f45aa935544 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -129,7 +129,7 @@ static int8_t GetCorrectDeviceIdByPlaceType( switch (place_type) { case paddle::PlaceType::kCPU: return 0; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case paddle::PlaceType::kGPU: return phi::backends::gpu::GetCurrentDeviceId(); #endif @@ -175,7 +175,7 @@ bool operator==(PlaceType place_type, const Place &place) { GPUPlace DefaultGPUPlace() { return GPUPlace( -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::backends::gpu::GetCurrentDeviceId()); #else 0); diff --git a/paddle/phi/common/transform.h b/paddle/phi/common/transform.h index 0b1a94aa0c1b90..e80561284b885f 100644 --- a/paddle/phi/common/transform.h +++ b/paddle/phi/common/transform.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include #include "thrust/device_ptr.h" @@ -92,7 +92,7 @@ struct Transform { } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) // PointerToThrustDevicePtr has two specializations, one casts a (CUDA // device) pointer into thrust::device_ptr, the other keeps rest types @@ -153,12 +153,6 @@ struct Transform { CastToCUDATransformIterator(last), CastToCUDATransformIterator(result), op); -#elif defined(__MUSACC__) - thrust::transform(thrust::musa::par.on(context.stream()), - CastToCUDATransformIterator(first), - CastToCUDATransformIterator(last), - CastToCUDATransformIterator(result), - op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first), @@ -190,13 +184,6 @@ struct Transform { CastToCUDATransformIterator(first2), CastToCUDATransformIterator(result), op); -#elif defined(__MUSACC__) - thrust::transform(thrust::musa::par.on(context.stream()), - CastToCUDATransformIterator(first1), - CastToCUDATransformIterator(last1), - CastToCUDATransformIterator(first2), - CastToCUDATransformIterator(result), - op); #else thrust::transform(thrust::cuda::par.on(context.stream()), CastToCUDATransformIterator(first1), diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 15585543417d8e..d4c5de0dbe6dc9 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -61,7 +61,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { return phi::CPUPlace(); case phi::Backend::UNDEFINED: return phi::Place(); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case phi::Backend::GPU: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -70,7 +70,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::ONEDNN: // NOLINT return phi::CPUPlace(); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); @@ -81,7 +81,7 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif case phi::Backend::KPS: -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #elif defined(PADDLE_WITH_XPU_KP) diff --git a/paddle/phi/core/cuda_stream.h b/paddle/phi/core/cuda_stream.h index 50c07b6e2cc46b..b27770b0814339 100644 --- a/paddle/phi/core/cuda_stream.h +++ b/paddle/phi/core/cuda_stream.h @@ -23,11 +23,6 @@ limitations under the License. 
*/ using gpuStream_t = cudaStream_t; #endif -#ifdef PADDLE_WITH_MUSA -#include -using gpuStream_t = musaStream_t; -#endif - #ifdef PADDLE_WITH_HIP #include using gpuStream_t = hipStream_t; @@ -78,9 +73,6 @@ class CUDAStream { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamCreateWithPriority( &stream, static_cast(flag), priority)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamCreateWithPriority( - &stream, static_cast(flag), priority)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamCreateWithPriority( &stream, static_cast(flag), priority)); @@ -100,8 +92,6 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(raw_stream())); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamDestroy(raw_stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(raw_stream())); #endif @@ -122,14 +112,6 @@ class CUDAStream { if (err == hipErrorNotReady) { return false; } -#elif defined(PADDLE_WITH_MUSA) - musaError_t err = musaStreamQuery(raw_stream()); - if (err == musaSuccess) { - return true; - } - if (err == musaErrorNotReady) { - return false; - } #else cudaError_t err = cudaStreamQuery(raw_stream()); if (err == cudaSuccess) { @@ -152,8 +134,6 @@ class CUDAStream { void WaitEvent(gpuEvent_t ev) const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(raw_stream(), ev, 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamWaitEvent(raw_stream(), ev, 0)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(raw_stream(), ev, 0)); #endif @@ -166,8 +146,6 @@ class CUDAStream { backends::gpu::GPUDeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP hipStreamDestroy(raw_stream()); -#elif defined(PADDLE_WITH_MUSA) - musaStreamDestroy(raw_stream()); #else cudaStreamDestroy(raw_stream()); #endif diff --git a/paddle/phi/core/distributed/CMakeLists.txt b/paddle/phi/core/distributed/CMakeLists.txt index 34046df6013a57..00000c3fff9e0f 100644 --- a/paddle/phi/core/distributed/CMakeLists.txt +++ b/paddle/phi/core/distributed/CMakeLists.txt @@ -4,7 +4,7 @@ add_subdirectory(auto_parallel) set(DISTRIBUTED_COMMON_SRCS comm_context_manager.cc) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) list(APPEND DISTRIBUTED_COMMON_SRCS comm_task_manager.cc) list(APPEND DISTRIBUTED_COMMON_SRCS nccl_comm_context.cc nccl_comm_task.cc nccl_tools.cc) diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc index 9407d1fad7f428..e7a1ec15da307a 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.cc @@ -101,7 +101,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, store, unique_comm_key, dev_ctx.GetPlace(), rank, world_size); #endif } else { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) if (phi::GPUContext::classof(&dev_ctx)) { CommContextManager::CreateNCCLCommContext( store, unique_comm_key, rank, world_size); @@ -164,7 +164,7 @@ bool NeedComputationClipForPP( } Place GetDefaultPlace() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (phi::backends::gpu::GetGPUDeviceCount() >= 0) { return paddle::DefaultGPUPlace(); } 
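The hunks above all follow the same shape: the three-way CUDA/HIP/MUSA dispatch added for paddle_musa is collapsed back to the original two-way CUDA/HIP guard, both for the gpuStream_t alias and for the runtime calls behind it. As a rough illustration of that restored pattern only — a minimal sketch with a hypothetical helper name, not Paddle's actual wrapper — the two-way guard typically reads:

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpuStream_t = hipStream_t;   // ROCm build
#else
#include <cuda_runtime.h>
using gpuStream_t = cudaStream_t;  // CUDA build (default branch)
#endif

// Hypothetical helper: block the host until all work queued on `stream` is done.
// Error handling is omitted here; Paddle itself wraps such calls in
// PADDLE_ENFORCE_GPU_SUCCESS, as seen in the CUDAStream hunks above.
inline void DemoStreamSync(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
  (void)hipStreamSynchronize(stream);
#else
  (void)cudaStreamSynchronize(stream);
#endif
}

This two-branch structure is exactly what the CUDAStream methods above (stream creation, Query, WaitEvent, destruction) return to once the musa* branches are removed.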
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h index 41cfd4efca8fd7..022dc065980641 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h @@ -71,7 +71,7 @@ std::vector BalancedSplit(int64_t total_nums, int64_t num_of_pieces); CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, const std::vector& process_ids); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, ...) \ do { \ if (phi::CPUContext::classof(dev_ctx)) { \ @@ -123,7 +123,7 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__); \ } while (0) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...) \ do { \ if (phi::CPUContext::classof(dev_ctx)) { \ diff --git a/paddle/phi/core/distributed/check/CMakeLists.txt b/paddle/phi/core/distributed/check/CMakeLists.txt index 964106feac4027..1721a4a4602d10 100644 --- a/paddle/phi/core/distributed/check/CMakeLists.txt +++ b/paddle/phi/core/distributed/check/CMakeLists.txt @@ -1,6 +1,6 @@ set(CHECK_COMMON_SRCS static_check.cc) -if(WITH_NCCL OR WITH_RCCL OR WITH_MCCL) +if(WITH_NCCL OR WITH_RCCL) list(APPEND CHECK_COMMON_SRCS nccl_dynamic_check.cc) endif() diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc index 4a7b931ad2b332..9307af45bd622b 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.cc +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.cc @@ -30,16 +30,6 @@ #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost #define gpuMemcpyHostToDevice hipMemcpyHostToDevice #define gpuFree hipFree -#elif defined(PADDLE_WITH_MCCL) -#include - -#include "paddle/phi/backends/dynload/mccl.h" - -#define gpuMalloc musaMalloc -#define gpuMemcpy musaMemcpy -#define gpuMemcpyDeviceToHost musaMemcpyDeviceToHost -#define gpuMemcpyHostToDevice musaMemcpyHostToDevice -#define gpuFree musaFree #else #include @@ -66,7 +56,7 @@ void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm) { + ncclComm_t comm) { constexpr int kSize = sizeof(int64_t); int64_t dtype_host = static_cast(tensor.dtype()); int64_t* dtype_device; @@ -74,10 +64,10 @@ void NCCLDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(dtype_device, &dtype_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclBroadcast(dtype_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(dtype_device, dtype_device, 1, - mcclInt64, + ncclInt64, root_rank, comm, kDefaultStream)); @@ -105,7 +95,7 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm) { + ncclComm_t comm) { CheckDataType(tensor, root_rank, cur_rank, comm); constexpr int kSize = sizeof(int64_t); @@ -116,10 +106,10 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& tensor, 
PADDLE_ENFORCE_GPU_SUCCESS( gpuMemcpy(shape_device, &shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclBroadcast(shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclBroadcast(shape_device, shape_device, 1, - mcclInt64, + ncclInt64, root_rank, comm, kDefaultStream)); @@ -140,7 +130,7 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, const std::vector& in_size_each_rank, int cur_rank, int world_size, - mcclComm_t comm) { + ncclComm_t comm) { CheckDataType(out_tensor, /*root_rank*/ 0, cur_rank, comm); CheckDataType(in_tensor, /*root_rank*/ 0, cur_rank, comm); @@ -153,11 +143,11 @@ void NCCLDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&in_shape_device, kSize)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( in_shape_device, &in_shape_host, kSize, gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclReduce(in_shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclReduce(in_shape_device, in_shape_device, 1, - mcclInt64, - mcclSum, + ncclInt64, + ncclSum, rank, comm, kDefaultStream)); @@ -177,7 +167,7 @@ void NCCLDynamicCheck::CheckGatherShape( int root_rank, int cur_rank, int world_size, - mcclComm_t comm) { + ncclComm_t comm) { std::vector shapes(world_size, 0); shapes[cur_rank] = in_tensor.numel(); int64_t* in_shape_device; @@ -188,11 +178,11 @@ void NCCLDynamicCheck::CheckGatherShape( world_size * sizeof(int64_t), gpuMemcpyHostToDevice)); - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclAllReduce(in_shape_device, + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclAllReduce(in_shape_device, in_shape_device, world_size, - mcclInt64, - mcclSum, + ncclInt64, + ncclSum, comm, kDefaultStream)); PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy(shapes.data(), diff --git a/paddle/phi/core/distributed/check/nccl_dynamic_check.h b/paddle/phi/core/distributed/check/nccl_dynamic_check.h index 502ec886211e1b..23e8386d6f2aff 100644 --- a/paddle/phi/core/distributed/check/nccl_dynamic_check.h +++ b/paddle/phi/core/distributed/check/nccl_dynamic_check.h @@ -21,8 +21,6 @@ #if defined(PADDLE_WITH_RCCL) using gpuStream_t = hipStream_t; -#elif defined(PADDLE_WITH_MCCL) -using gpuStream_t = musaStream_t; #else using gpuStream_t = cudaStream_t; #endif @@ -38,21 +36,21 @@ struct NCCLDynamicCheck { static void CheckDataType(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm); + ncclComm_t comm); static void CheckShape(const phi::DenseTensor& tensor, int64_t shape); static void CheckShape(const phi::DenseTensor& tensor, int root_rank, int cur_rank, - mcclComm_t comm); + ncclComm_t comm); static void CheckShape(const phi::DenseTensor& out_tensor, const phi::DenseTensor& in_tensor, const std::vector& in_size_each_rank, int cur_rank, int world_size, - mcclComm_t comm); + ncclComm_t comm); // can be used to check gather and all gather static void CheckGatherShape(const phi::DenseTensor& in_tensor, @@ -60,7 +58,7 @@ struct NCCLDynamicCheck { int root_rank, int cur_rank, int world_size, - mcclComm_t comm); + ncclComm_t comm); private: // `0` represents default stream for both cuda & hip diff --git a/paddle/phi/core/distributed/comm_context_manager.cc b/paddle/phi/core/distributed/comm_context_manager.cc index 2aee7c7c851042..5fd7861cc52b2d 100644 --- a/paddle/phi/core/distributed/comm_context_manager.cc +++ b/paddle/phi/core/distributed/comm_context_manager.cc @@ -29,7 +29,7 @@ #include "paddle/phi/core/distributed/store/gloo_store.h" #endif -#if defined(PADDLE_WITH_NCCL) || 
defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" @@ -49,13 +49,13 @@ namespace distributed { int CommContextManager::device_id = -1; void CommContextManager::SetDeviceId(int dev_id) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) phi::backends::gpu::SetDeviceId(dev_id); CommContextManager::device_id = dev_id; #endif } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void CommContextManager::CreateNCCLCommContext( const std::shared_ptr& store, const std::string& unique_comm_key, @@ -67,16 +67,16 @@ void CommContextManager::CreateNCCLCommContext( if (comm_context_manager.Has(unique_comm_key)) { return; } - mcclUniqueId nccl_id; + ncclUniqueId nccl_id; if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclGetUniqueId(&nccl_id)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclGetUniqueId(&nccl_id)); } std::string unique_key = "NCCLCommContext/" + unique_comm_key + hash_key; if (rank == 0 || (p2p_opt && p2p_opt->is_p2p_op && p2p_opt->p2p_rank == 0)) { std::vector nccl_id_wrapper( reinterpret_cast(&nccl_id), - reinterpret_cast(&nccl_id) + MCCL_UNIQUE_ID_BYTES); + reinterpret_cast(&nccl_id) + NCCL_UNIQUE_ID_BYTES); store->set(unique_key, nccl_id_wrapper); } else { const auto& nccl_id_wrapper = store->get(unique_key); @@ -231,8 +231,8 @@ CommContext* CommContextManager::Get(const std::string& unique_comm_key) const { return id_to_comm_context_.at(unique_comm_key).get(); } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -int CommContextManager::GetRingId(const mcclComm_t& comm) const { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +int CommContextManager::GetRingId(const ncclComm_t& comm) const { for (auto iter = id_to_comm_context_.begin(); iter != id_to_comm_context_.end(); ++iter) { diff --git a/paddle/phi/core/distributed/comm_context_manager.h b/paddle/phi/core/distributed/comm_context_manager.h index 5c3f3101dcada6..8c4d802294986f 100644 --- a/paddle/phi/core/distributed/comm_context_manager.h +++ b/paddle/phi/core/distributed/comm_context_manager.h @@ -24,7 +24,7 @@ #include "paddle/phi/common/place.h" #include "paddle/phi/core/distributed/comm_context.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/gpu/forwards.h" #endif @@ -57,8 +57,8 @@ class CommContextManager { CommContext* Get(const std::string& unique_comm_key) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) - int GetRingId(const mcclComm_t& comm) const; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + int GetRingId(const ncclComm_t& comm) const; #endif bool Has(const std::string& unique_comm_key) const; @@ -71,7 +71,7 @@ class CommContextManager { std::vector GetGroupRanks(const std::string& pg_key) const; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void CreateNCCLCommContext(const std::shared_ptr& 
store, const std::string& unique_comm_key, int rank, diff --git a/paddle/phi/core/distributed/comm_task.h b/paddle/phi/core/distributed/comm_task.h index ca7f8495495d2d..47ba01b980479a 100644 --- a/paddle/phi/core/distributed/comm_task.h +++ b/paddle/phi/core/distributed/comm_task.h @@ -25,9 +25,6 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" #endif -#if defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" -#endif #if defined(PADDLE_WITH_NCCL) #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -46,7 +43,7 @@ class CommTask { int gid = 0, uint64_t seq = 0, int64_t numel = 0, - mcclComm_t nccl_comm = nullptr, + ncclComm_t nccl_comm = nullptr, gpuStream_t nccl_stream = nullptr, CommType comm_type = CommType::UNKNOWN) : backend_(backend), @@ -92,7 +89,7 @@ class CommTask { std::shared_ptr GetStore() { return store_; } void SetStore(std::shared_ptr store) { store_ = store; } - mcclComm_t nccl_comm() { return nccl_comm_; } + ncclComm_t nccl_comm() { return nccl_comm_; } gpuStream_t nccl_stream() { return nccl_stream_; } virtual std::string GetTraceMsg() { @@ -163,7 +160,7 @@ class CommTask { int gid_; uint64_t seq_{0}; int64_t numel_; - mcclComm_t nccl_comm_; + ncclComm_t nccl_comm_; gpuStream_t nccl_stream_; CommType comm_type_; bool start_trace_updated_{false}; diff --git a/paddle/phi/core/distributed/comm_task_manager.cc b/paddle/phi/core/distributed/comm_task_manager.cc index 822b3892ec3646..ae7de422913587 100644 --- a/paddle/phi/core/distributed/comm_task_manager.cc +++ b/paddle/phi/core/distributed/comm_task_manager.cc @@ -32,7 +32,7 @@ #include "paddle/phi/core/distributed/store/store.h" #include "paddle/phi/core/enforce.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/phi/core/distributed/comm_task_manager.h" #include "paddle/phi/core/distributed/nccl_comm_context.h" #endif diff --git a/paddle/phi/core/distributed/nccl_comm_context.cc b/paddle/phi/core/distributed/nccl_comm_context.cc index 4600d2e14cdbbe..8da676e74d911a 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.cc +++ b/paddle/phi/core/distributed/nccl_comm_context.cc @@ -30,16 +30,16 @@ namespace distributed { // set this flag to `true` and recompile to enable dynamic checks constexpr bool FLAGS_enable_nccl_dynamic_check = false; -NCCLCommContext::NCCLCommContext(int rank, int size, mcclUniqueId nccl_id) +NCCLCommContext::NCCLCommContext(int rank, int size, ncclUniqueId nccl_id) : CommContext(rank, size) { - MCCL_CHECK( - phi::dynload::mcclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); - MCCL_CHECK(phi::dynload::mcclGetVersion(&nccl_version_)); + NCCL_CHECK( + phi::dynload::ncclCommInitRank(&nccl_comm_, size_, nccl_id, rank_)); + NCCL_CHECK(phi::dynload::ncclGetVersion(&nccl_version_)); } int NCCLCommContext::GetNcclVersion() { return nccl_version_; } -mcclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } +ncclComm_t NCCLCommContext::GetNcclComm() { return nccl_comm_; } gpuStream_t NCCLCommContext::GetStream() { return dev_ctx_->stream(); } @@ -77,7 +77,7 @@ void NCCLCommContext::Broadcast(phi::DenseTensor* out_tensor, if (FLAGS_enable_nccl_dynamic_check) { NCCLDynamicCheck::CheckShape(*out_tensor, root, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclBroadcast(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclBroadcast(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -100,7 +100,7 
@@ void NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclAllGather(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclAllGather(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -109,7 +109,7 @@ void NCCLCommContext::AllGather(phi::DenseTensor* out_tensor, } void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream) { phi::distributed::CommStaticCheck::ScatterLikeShape(*out_tensor, in_tensor, @@ -122,7 +122,7 @@ void NCCLCommContext::ReduceScatter(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclReduceScatter(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclReduceScatter(in_tensor.data(), out_tensor->data(), out_tensor->numel(), ToNCCLDataType(in_tensor.type()), @@ -141,7 +141,7 @@ void NCCLCommContext::Send(const phi::DenseTensor& in_tensor, NCCLDynamicCheck::CheckShape(in_tensor, rank_, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclSend(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclSend(in_tensor.data(), count, ToNCCLDataType(in_tensor.dtype()), peer, @@ -160,7 +160,7 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, NCCLDynamicCheck::CheckShape(*out_tensor, peer, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclRecv(out_tensor->data(), + NCCL_CHECK(phi::dynload::ncclRecv(out_tensor->data(), count, ToNCCLDataType(out_tensor->dtype()), peer, @@ -172,7 +172,7 @@ void NCCLCommContext::Recv(phi::DenseTensor* out_tensor, void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream) { phi::distributed::CommStaticCheck::SameShape(*out_tensor, in_tensor, @@ -185,7 +185,7 @@ void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclAllReduce(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclAllReduce(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -196,7 +196,7 @@ void NCCLCommContext::AllReduce(phi::DenseTensor* out_tensor, void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, int root, gpuStream_t stream) { phi::distributed::CommStaticCheck::SameShape(*out_tensor, @@ -210,7 +210,7 @@ void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, rank_, nccl_comm_); } - MCCL_CHECK(phi::dynload::mcclReduce(in_tensor.data(), + NCCL_CHECK(phi::dynload::ncclReduce(in_tensor.data(), out_tensor->data(), in_tensor.numel(), ToNCCLDataType(in_tensor.type()), @@ -221,23 +221,23 @@ void NCCLCommContext::Reduce(phi::DenseTensor* out_tensor, } void NCCLCommContext::GroupStart() { - MCCL_CHECK(phi::dynload::mcclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); } -void NCCLCommContext::GroupEnd() { MCCL_CHECK(phi::dynload::mcclGroupEnd()); } +void NCCLCommContext::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } -// #if NCCL_VERSION_CODE >= 21100 -void NCCLCommContext::RedOpCreatePreMulSum(mcclRedOp_t* op, +#if NCCL_VERSION_CODE >= 21100 +void NCCLCommContext::RedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - mcclDataType_t dtype, - mcclScalarResidence_t residence) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpCreatePreMulSum( + ncclDataType_t dtype, + ncclScalarResidence_t residence) { + 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpCreatePreMulSum( op, scalar, dtype, residence, nccl_comm_)); } -void NCCLCommContext::RedOpDestroy(mcclRedOp_t op) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mcclRedOpDestroy(op, nccl_comm_)); +void NCCLCommContext::RedOpDestroy(ncclRedOp_t op) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRedOpDestroy(op, nccl_comm_)); } -// #endif +#endif } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/distributed/nccl_comm_context.h b/paddle/phi/core/distributed/nccl_comm_context.h index e7a73f12046721..609b5e0defe079 100644 --- a/paddle/phi/core/distributed/nccl_comm_context.h +++ b/paddle/phi/core/distributed/nccl_comm_context.h @@ -18,11 +18,6 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #endif @@ -34,8 +29,6 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" #else #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -46,12 +39,12 @@ namespace distributed { class NCCLCommContext final : public CommContext { public: - NCCLCommContext(int rank, int size, mcclUniqueId nccl_id); + NCCLCommContext(int rank, int size, ncclUniqueId nccl_id); ~NCCLCommContext() override = default; int GetNcclVersion(); - mcclComm_t GetNcclComm(); + ncclComm_t GetNcclComm(); gpuStream_t GetStream(); @@ -87,7 +80,7 @@ class NCCLCommContext final : public CommContext { void ReduceScatter(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream); void AllGather(phi::DenseTensor* out_tensor, @@ -96,12 +89,12 @@ class NCCLCommContext final : public CommContext { void AllReduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, gpuStream_t stream); void Reduce(phi::DenseTensor* out_tensor, const phi::DenseTensor& in_tensor, - mcclRedOp_t reduce_type, + ncclRedOp_t reduce_type, int root, gpuStream_t stream); @@ -109,25 +102,25 @@ class NCCLCommContext final : public CommContext { void GroupEnd(); -// #if NCCL_VERSION_CODE >= 21100 +#if NCCL_VERSION_CODE >= 21100 // Creates a new reduction operator which pre-multiplies input values by a // given scalar locally before reducing them with peer values via summation. - void RedOpCreatePreMulSum(mcclRedOp_t* op, + void RedOpCreatePreMulSum(ncclRedOp_t* op, void* scalar, - mcclDataType_t dtype, - mcclScalarResidence_t residence); + ncclDataType_t dtype, + ncclScalarResidence_t residence); // Destroys the reduction operator op. The operator must have been created by // ncclRedOpCreatePreMul with the matching communicator comm. 
- void RedOpDestroy(mcclRedOp_t op); -// #endif + void RedOpDestroy(ncclRedOp_t op); +#endif private: DISABLE_COPY_AND_ASSIGN(NCCLCommContext); int nccl_version_; - mcclComm_t nccl_comm_; + ncclComm_t nccl_comm_; std::unique_ptr dev_ctx_; diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc index 5f11c8101df938..4e2efea0068eb9 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.cc +++ b/paddle/phi/core/distributed/nccl_comm_task.cc @@ -33,7 +33,7 @@ NCCLCommTask::NCCLCommTask(const phi::Place& place, int64_t numel, bool sync_op, bool use_calc_stream, - mcclComm_t nccl_comm, + ncclComm_t nccl_comm, gpuStream_t stream, CommType comm_type, int64_t timeout) @@ -62,8 +62,6 @@ void NCCLCommTask::StartRecord() { if (!start_event_created_) { #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventCreateWithFlags(&nccl_start_event_, cuda_event_flags_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventCreateWithFlags(&nccl_start_event_, musa_event_flags_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventCreateWithFlags(&nccl_start_event_, hip_event_flags_)); #endif @@ -71,8 +69,6 @@ void NCCLCommTask::StartRecord() { } #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventRecord(nccl_start_event_, nccl_stream_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventRecord(nccl_start_event_, nccl_stream_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventRecord(nccl_start_event_, nccl_stream_)); #endif @@ -82,8 +78,6 @@ void NCCLCommTask::EndRecord() { if (!end_event_created_) { #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventCreateWithFlags(&nccl_end_event_, cuda_event_flags_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventCreateWithFlags(&nccl_end_event_, musa_event_flags_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventCreateWithFlags(&nccl_end_event_, hip_event_flags_)); #endif @@ -91,8 +85,6 @@ void NCCLCommTask::EndRecord() { } #ifdef PADDLE_WITH_CUDA CUDA_CHECK(cudaEventRecord(nccl_end_event_, nccl_stream_)); -#elif defined(PADDLE_WITH_MUSA) - MUSA_CHECK(musaEventRecord(nccl_end_event_, nccl_stream_)); #else // PADDLE_WITH_HIP HIP_CHECK(hipEventRecord(nccl_end_event_, nccl_stream_)); #endif @@ -111,19 +103,6 @@ void NCCLCommTask::ClearRecord() { end_event_created_ = false; } } -#elif defined(PADDLE_WITH_MUSA) -void NCCLCommTask::ClearRecord() { - if (start_event_created_) { - backends::gpu::GPUDeviceGuard guard(place_.device); - MUSA_CHECK(musaEventDestroy(nccl_start_event_)); - start_event_created_ = false; - } - if (end_event_created_) { - backends::gpu::GPUDeviceGuard guard(place_.device); - MUSA_CHECK(musaEventDestroy(nccl_end_event_)); - end_event_created_ = false; - } -} #else // PADDLE_WITH_HIP void NCCLCommTask::ClearRecord() { if (start_event_created_) { @@ -150,16 +129,6 @@ bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) { // ignore and clear the error if not ready CUDA_CHECK(cudaGetLastError()); } -#elif defined(PADDLE_WITH_MUSA) - musaError_t ret = musaEventQuery(event); - if (ret == musaSuccess) { - return true; - } else if (ret != musaErrorNotReady) { - MUSA_CHECK(ret); - } else { - // ignore and clear the error if not ready - MUSA_CHECK(musaGetLastError()); - } #else // PADDLE_WITH_HIP hipError_t ret = hipEventQuery(event); if (ret == hipSuccess) { @@ -174,7 +143,7 @@ bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) { return false; } -std::string GetNCCLErrorDetail(mcclResult_t result) { +std::string GetNCCLErrorDetail(ncclResult_t result) { std::string detail; std::string last_error; #ifdef ENABLE_NCCL_GET_LAST_ERROR @@ -182,10 
+151,10 @@ std::string GetNCCLErrorDetail(mcclResult_t result) { ", Last error: " + std::string(phi::dynload::ncclGetLastError(NULL)); #endif switch (result) { - case mcclUnhandledCudaError: + case ncclUnhandledCudaError: detail = "ncclUnhandledCudaError: Call to CUDA function failed."; break; - case mcclSystemError: + case ncclSystemError: detail = "ncclSystemError: System call (e.g. socket, malloc) or external " "library call failed or device error. "; @@ -195,13 +164,13 @@ std::string GetNCCLErrorDetail(mcclResult_t result) { detail += "It can be also caused by unexpected exit of a remote peer."; #endif break; - case mcclInternalError: + case ncclInternalError: detail = "ncclInternalError: Internal check failed."; break; - case mcclInvalidArgument: + case ncclInvalidArgument: detail = "ncclInvalidArgument: Invalid value for an argument."; break; - case mcclInvalidUsage: + case ncclInvalidUsage: detail = "ncclInvalidUsage: This usually reflects invalid usage of NCCL " "library."; @@ -225,10 +194,10 @@ std::string NCCLCommTask::GetCommErrors() { return comm_error_; } - mcclResult_t nccl_async_error; - MCCL_CHECK( - phi::dynload::mcclCommGetAsyncError(nccl_comm_, &nccl_async_error)); - if (nccl_async_error != mcclSuccess) { + ncclResult_t nccl_async_error; + NCCL_CHECK( + phi::dynload::ncclCommGetAsyncError(nccl_comm_, &nccl_async_error)); + if (nccl_async_error != ncclSuccess) { comm_error_ = "\n\t Find nccl comm error: " + GetNCCLErrorDetail(nccl_async_error); } @@ -272,7 +241,7 @@ void NCCLCommTask::AbortComm() { if (aborted_) { return; } - MCCL_CHECK(phi::dynload::mcclCommAbort(nccl_comm_)); + NCCL_CHECK(phi::dynload::ncclCommAbort(nccl_comm_)); aborted_ = true; nccl_comm_ = nullptr; diff --git a/paddle/phi/core/distributed/nccl_comm_task.h b/paddle/phi/core/distributed/nccl_comm_task.h index 11bbbd1c9dcf70..fca9004cf0b2d4 100644 --- a/paddle/phi/core/distributed/nccl_comm_task.h +++ b/paddle/phi/core/distributed/nccl_comm_task.h @@ -21,8 +21,6 @@ #if defined(PADDLE_WITH_RCCL) #include "paddle/phi/backends/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include "paddle/phi/backends/dynload/mccl.h" #else #include "paddle/phi/backends/dynload/nccl.h" #endif @@ -44,7 +42,7 @@ class NCCLCommTask : public CommTask { int64_t numel = 0, bool sync_op = true, bool use_calc_stream = false, - mcclComm_t = nullptr, + ncclComm_t = nullptr, gpuStream_t = nullptr, CommType comm_type = CommType::UNKNOWN, int64_t timeout = DefaultTimeout); @@ -73,8 +71,6 @@ class NCCLCommTask : public CommTask { #ifdef PADDLE_WITH_CUDA unsigned int cuda_event_flags_ = cudaEventDisableTiming; -#elif defined(PADDLE_WITH_MUSA) - unsigned int musa_event_flags_ = musaEventDisableTiming; #else // PADDLE_WITH_HIP unsigned int hip_event_flags_ = hipEventDisableTiming; #endif diff --git a/paddle/phi/core/distributed/nccl_tools.cc b/paddle/phi/core/distributed/nccl_tools.cc index 24a1f3ee7891d1..a5388796d1f45b 100644 --- a/paddle/phi/core/distributed/nccl_tools.cc +++ b/paddle/phi/core/distributed/nccl_tools.cc @@ -19,74 +19,74 @@ #include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" -// #if NCCL_VERSION_CODE >= 21300 +#if NCCL_VERSION_CODE >= 21300 #define ENABLE_NCCL_GET_LAST_ERROR #define NCCL_REMOTE_ERROR -// #endif +#endif namespace phi { namespace distributed { -mcclRedOp_t ToNCCLRedType(ReduceOp reduction) { - static const std::unordered_map red_type = { - {ReduceOp::MIN, mcclMin}, - {ReduceOp::MAX, mcclMax}, - {ReduceOp::SUM, mcclSum}, - {ReduceOp::PRODUCT, mcclProd}, +ncclRedOp_t 
ToNCCLRedType(ReduceOp reduction) { + static const std::unordered_map red_type = { + {ReduceOp::MIN, ncclMin}, + {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, + {ReduceOp::PRODUCT, ncclProd}, }; auto it = red_type.find(reduction); PADDLE_ENFORCE_EQ(it != red_type.end(), true, phi::errors::InvalidArgument( - "Invalid nccl reduction. Must be mcclMin | mcclMax | " - "mcclProd | mcclSum")); + "Invalid nccl reduction. Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); return it->second; } -std::string SerializeNCCLUniqueId(const mcclUniqueId& ncclID) { +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { const uint8_t* bytes = reinterpret_cast(&ncclID); std::ostringstream oss; - for (auto i = 0; i < MCCL_UNIQUE_ID_BYTES; ++i) { + for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { oss << std::hex << static_cast(bytes[i]); } return oss.str(); } -std::string NCCLDTypeToString(mcclDataType_t dtype) { +std::string NCCLDTypeToString(ncclDataType_t dtype) { #define PD_NCCL_DTYPE_TO_STR(__nccl_dtype, __str_dtype) \ if (dtype == __nccl_dtype) return __str_dtype; - PD_NCCL_DTYPE_TO_STR(mcclFloat, "float32"); - PD_NCCL_DTYPE_TO_STR(mcclFloat32, "float32"); - PD_NCCL_DTYPE_TO_STR(mcclHalf, "float16"); - PD_NCCL_DTYPE_TO_STR(mcclFloat16, "float16"); -// // #if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 -// PD_NCCL_DTYPE_TO_STR(mcclBfloat16, "bfloat16"); -// // #endif - PD_NCCL_DTYPE_TO_STR(mcclDouble, "float64"); - PD_NCCL_DTYPE_TO_STR(mcclFloat64, "float64"); + PD_NCCL_DTYPE_TO_STR(ncclFloat, "float32"); + PD_NCCL_DTYPE_TO_STR(ncclFloat32, "float32"); + PD_NCCL_DTYPE_TO_STR(ncclHalf, "float16"); + PD_NCCL_DTYPE_TO_STR(ncclFloat16, "float16"); +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + PD_NCCL_DTYPE_TO_STR(ncclBfloat16, "bfloat16"); +#endif + PD_NCCL_DTYPE_TO_STR(ncclDouble, "float64"); + PD_NCCL_DTYPE_TO_STR(ncclFloat64, "float64"); - PD_NCCL_DTYPE_TO_STR(mcclInt8, "int8"); - PD_NCCL_DTYPE_TO_STR(mcclChar, "int8"); - PD_NCCL_DTYPE_TO_STR(mcclUint8, "uint8"); - PD_NCCL_DTYPE_TO_STR(mcclInt32, "int32"); - PD_NCCL_DTYPE_TO_STR(mcclInt, "int32"); - PD_NCCL_DTYPE_TO_STR(mcclUint32, "uint32"); - PD_NCCL_DTYPE_TO_STR(mcclInt64, "int64"); - PD_NCCL_DTYPE_TO_STR(mcclUint64, "uint64"); + PD_NCCL_DTYPE_TO_STR(ncclInt8, "int8"); + PD_NCCL_DTYPE_TO_STR(ncclChar, "int8"); + PD_NCCL_DTYPE_TO_STR(ncclUint8, "uint8"); + PD_NCCL_DTYPE_TO_STR(ncclInt32, "int32"); + PD_NCCL_DTYPE_TO_STR(ncclInt, "int32"); + PD_NCCL_DTYPE_TO_STR(ncclUint32, "uint32"); + PD_NCCL_DTYPE_TO_STR(ncclInt64, "int64"); + PD_NCCL_DTYPE_TO_STR(ncclUint64, "uint64"); #undef PD_NCCL_DTYPE_TO_STR PADDLE_THROW(phi::errors::InvalidArgument( "This datatype %d in nccl is not supported.", static_cast(dtype))); } -std::string NCCLRedTypeToString(mcclRedOp_t op) { - if (op == mcclSum) return "SUM"; - if (op == mcclProd) return "PROD"; - if (op == mcclMin) return "MIN"; - if (op == mcclMax) return "MAX"; -// #if NCCL_VERSION_CODE >= 21000 - if (op == mcclAvg) return "AVG"; -// #endif +std::string NCCLRedTypeToString(ncclRedOp_t op) { + if (op == ncclSum) return "SUM"; + if (op == ncclProd) return "PROD"; + if (op == ncclMin) return "MIN"; + if (op == ncclMax) return "MAX"; +#if NCCL_VERSION_CODE >= 21000 + if (op == ncclAvg) return "AVG"; +#endif return "UDF_" + std::to_string(op); } diff --git a/paddle/phi/core/distributed/nccl_tools.h b/paddle/phi/core/distributed/nccl_tools.h index e256d4ef4d0093..0ab380a4177838 100644 --- a/paddle/phi/core/distributed/nccl_tools.h +++ b/paddle/phi/core/distributed/nccl_tools.h 
@@ -21,9 +21,6 @@ #ifdef PADDLE_WITH_RCCL #include #include "paddle/phi/backends/dynload/rccl.h" -#elif defined(PADDLE_WITH_MCCL) -#include -#include "paddle/phi/backends/dynload/mccl.h" #else #include #include "paddle/phi/backends/dynload/nccl.h" @@ -35,7 +32,7 @@ namespace distributed { #define NCCL_CHECK(cmd) \ do { \ ncclResult_t r = cmd; \ - if (r != mcclSuccess) { \ + if (r != ncclSuccess) { \ PADDLE_THROW( \ phi::errors::External("Failed, NCCL error %s:%d '%s'\n", \ __FILE__, \ @@ -44,18 +41,6 @@ namespace distributed { } \ } while (0) -#define MCCL_CHECK(cmd) \ - do { \ - mcclResult_t r = cmd; \ - if (r != mcclSuccess) { \ - PADDLE_THROW( \ - phi::errors::External("Failed, MCCL error %s:%d '%s'\n", \ - __FILE__, \ - __LINE__, \ - phi::dynload::mcclGetErrorString(r))); \ - } \ - } while (0) - #ifdef PADDLE_WITH_NCCL #define CUDA_CHECK(expr) \ do { \ @@ -67,17 +52,6 @@ namespace distributed { cudaGetErrorString(r))); \ } \ } while (0) -#elif defined(PADDLE_WITH_MCCL) -#define MUSA_CHECK(expr) \ - do { \ - musaError_t r = expr; \ - if (r != musaSuccess) { \ - PADDLE_THROW(phi::errors::External("Failed, musa error %s:%d '%s'\n", \ - __FILE__, \ - __LINE__, \ - musaGetErrorString(r))); \ - } \ - } while (0) #else // PADDLE_WITH_RCCL #define HIP_CHECK(expr) \ do { \ @@ -91,13 +65,13 @@ namespace distributed { } while (0) #endif -mcclRedOp_t ToNCCLRedType(ReduceOp reduction); +ncclRedOp_t ToNCCLRedType(ReduceOp reduction); -std::string SerializeNCCLUniqueId(const mcclUniqueId& ncclID); +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID); -std::string NCCLDTypeToString(mcclDataType_t dtype); +std::string NCCLDTypeToString(ncclDataType_t dtype); -std::string NCCLRedTypeToString(mcclRedOp_t op); +std::string NCCLRedTypeToString(ncclRedOp_t op); } // namespace distributed } // namespace phi diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 0c21ffac88703f..61e502951f24ee 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -23,16 +23,6 @@ limitations under the License. */ #include #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include -#include -#include -#include -#include -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #include @@ -65,17 +55,6 @@ limitations under the License. */ #endif // __APPLE__ #endif // PADDLE_WITH_CUDA -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/backends/dynload/mufft.h" -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/dynload/mudnn.h" -#include "paddle/phi/backends/dynload/murand.h" -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -#include -#include "paddle/phi/backends/dynload/mccl.h" -#endif // __APPLE__ -#endif - #ifdef PADDLE_WITH_HIP #include "paddle/phi/backends/dynload/hipfft.h" #include "paddle/phi/backends/dynload/hiprand.h" @@ -90,7 +69,7 @@ limitations under the License. */ // Note: these headers for simplify demangle type string #include "paddle/phi/core/type_defs.h" // Note: this header for simplify HIP and CUDA type string -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/gpu_types.h" #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -347,17 +326,6 @@ struct EnforceNotMet : public std::exception { abort(); \ } \ } while (0) -#elif defined(__MUSACC__) -#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Error: %s:%d Assertion `%s` failed. 
" __FORMAT "\n", \ - __FILE__, \ - __LINE__, \ - #_IS_NOT_ERROR, \ - ##__VA_ARGS__); \ - } \ - } while (0) #else #define PADDLE_ENFORCE(COND, ...) \ do { \ @@ -602,7 +570,7 @@ DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS); DEFINE_EXTERNAL_API_TYPE(CUresult, CUDA_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, mcclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details @@ -698,7 +666,7 @@ inline std::string build_nvidia_error_msg(CUresult stat) { /**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != mcclSuccess; + return nccl_result != ncclSuccess; } inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { @@ -899,7 +867,7 @@ inline std::string build_rocm_error_msg(rocblas_status stat) { /****** RCCL ERROR ******/ #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) inline bool is_error(ncclResult_t nccl_result) { - return nccl_result != mcclSuccess; + return nccl_result != ncclSuccess; } inline std::string build_rocm_error_msg(ncclResult_t nccl_result) { @@ -935,7 +903,7 @@ DEFINE_EXTERNAL_API_TYPE(rocblas_status, rocblas_status_success); DEFINE_EXTERNAL_API_TYPE(hipfftResult_t, HIPFFT_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_RCCL) -DEFINE_EXTERNAL_API_TYPE(ncclResult_t, mcclSuccess); +DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess); #endif } // namespace details @@ -990,7 +958,7 @@ inline void retry_sleep(unsigned millisecond) { } \ if (UNLIKELY(__cond__ != __success_type__)) { \ auto __summary__ = phi::errors::External( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ + ::phi::enforce::build_rocm_error_msg(__cond__)); \ __THROW_ERROR_INTERNAL__(__summary__); \ } \ } while (0) @@ -998,234 +966,6 @@ inline void retry_sleep(unsigned millisecond) { #undef DEFINE_EXTERNAL_API_TYPE #endif // PADDLE_WITH_HIP - - - - - - - - - - - - - - - -/**************************************************************************/ -/***************************** MUSA ERROR **********************************/ -#ifdef PADDLE_WITH_MUSA - -/***** MUSA ERROR *****/ -inline bool is_error(musaError_t e) { return e != musaSuccess; } - -inline std::string build_musa_error_msg(musaError_t e) { - std::ostringstream sout; - sout << " Musa error(" << e << "), " << musaGetErrorString(e) << "."; - return sout.str(); -} - -/***** MURAND ERROR *****/ -inline bool is_error(murandStatus_t stat) { - return stat != MURAND_STATUS_SUCCESS; -} - -inline const char* murandGetErrorString(murandStatus_t stat) { - switch (stat) { - case MURAND_STATUS_SUCCESS: - return "MURAND_STATUS_SUCCESS"; - case MURAND_STATUS_VERSION_MISMATCH: - return "MURAND_STATUS_VERSION_MISMATCH"; - case MURAND_STATUS_NOT_CREATED: - return "MURAND_STATUS_NOT_CREATED"; - case MURAND_STATUS_ALLOCATION_FAILED: - return "MURAND_STATUS_ALLOCATION_FAILED"; - case MURAND_STATUS_TYPE_ERROR: - return "MURAND_STATUS_TYPE_ERROR"; - case MURAND_STATUS_OUT_OF_RANGE: - return "MURAND_STATUS_OUT_OF_RANGE"; - case MURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "MURAND_STATUS_LENGTH_NOT_MULTIPLE"; - case MURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "MURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - case MURAND_STATUS_LAUNCH_FAILURE: - return "MURAND_STATUS_LAUNCH_FAILURE"; - case MURAND_STATUS_INTERNAL_ERROR: - return "MURAND_STATUS_INTERNAL_ERROR"; - case MURAND_STATUS_NOT_IMPLEMENTED: - return "MURAND_STATUS_NOT_IMPLEMENTED"; 
- default: - return "Unknown murand status"; - } -} - -inline std::string build_musa_error_msg(murandStatus_t stat) { - std::string msg(" Murand error, "); - return msg + murandGetErrorString(stat) + " "; -} - -/***** mudnn ERROR *****/ -// inline bool is_error(mudnnStatus_t stat) { -// return stat != cudnnStatusSuccess; -// } - -// inline std::string build_rocm_error_msg(miopenStatus_t stat) { -// std::string msg(" Miopen error, "); -// return msg + phi::dynload::miopenGetErrorString(stat) + " "; -// } - -/***** MUBLAS ERROR *****/ -inline bool is_error(mublasStatus stat) { - return stat != MUBLAS_STATUS_SUCCESS; -} - -inline const char* mublasGetErrorString(mublasStatus stat) { - switch (stat) { - case MUBLAS_STATUS_SUCCESS: - return "MUBLAS_STATUS_SUCCESS"; - case MUBLAS_STATUS_INVALID_HANDLE: - return "MUBLAS_STATUS_INVALID_HANDLE"; - case MUBLAS_STATUS_NOT_IMPLEMENTED: - return "MUBLAS_STATUS_NOT_IMPLEMENTED"; - case MUBLAS_STATUS_INVALID_POINTER: - return "MUBLAS_STATUS_INVALID_POINTER"; - case MUBLAS_STATUS_INVALID_SIZE: - return "MUBLAS_STATUS_INVALID_SIZE"; - case MUBLAS_STATUS_MEMORY_ERROR: - return "MUBLAS_STATUS_MEMORY_ERROR"; - case MUBLAS_STATUS_INTERNAL_ERROR: - return "MUBLAS_STATUS_INTERNAL_ERROR"; - case MUBLAS_STATUS_PERF_DEGRADED: - return "MUBLAS_STATUS_PERF_DEGRADED"; - case MUBLAS_STATUS_SIZE_QUERY_MISMATCH: - return "MUBLAS_STATUS_SIZE_QUERY_MISMATCH"; - case MUBLAS_STATUS_SIZE_INCREASED: - return "MUBLAS_STATUS_SIZE_INCREASED"; - case MUBLAS_STATUS_SIZE_UNCHANGED: - return "MUBLAS_STATUS_SIZE_UNCHANGED"; - case MUBLAS_STATUS_INVALID_VALUE: - return "MUBLAS_STATUS_INVALID_VALUE"; - case MUBLAS_STATUS_CONTINUE: - return "MUBLAS_STATUS_CONTINUE"; - case MUBLAS_STATUS_CHECK_NUMERICS_FAIL: - return "MUBLAS_STATUS_CHECK_NUMERICS_FAIL"; - default: - return "Unknown mublas status"; - } -} - -inline std::string build_musa_error_msg(mublasStatus stat) { - std::string msg(" mublas error, "); - return msg + mublasGetErrorString(stat) + " "; -} - -/****** MCCL ERROR ******/ -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -inline bool is_error(mcclResult_t mccl_result) { - return mccl_result != mcclSuccess; -} - -inline std::string build_musa_error_msg(mcclResult_t mccl_result) { - std::string msg(" Mccl error, "); - return msg + phi::dynload::mcclGetErrorString(mccl_result) + " "; -} -#endif // not(__APPLE__) and PADDLE_WITH_MCCL - -/***** MUFFT ERROR *****/ -inline bool is_error(mufftResult_t stat) { return stat != MUFFT_SUCCESS; } - -inline std::string build_musa_error_msg(mufftResult_t stat) { - std::string msg(" MUFFT error, "); - return msg + phi::dynload::mufftGetErrorString(stat) + " "; -} - -namespace details { - -template -struct ExternalApiType {}; - -#define DEFINE_EXTERNAL_API_TYPE(type, success_value) \ - template <> \ - struct ExternalApiType { \ - using Type = type; \ - static constexpr Type kSuccess = success_value; \ - } - -DEFINE_EXTERNAL_API_TYPE(musaError_t, musaSuccess); -DEFINE_EXTERNAL_API_TYPE(murandStatus_t, MURAND_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(mublasStatus, MUBLAS_STATUS_SUCCESS); -DEFINE_EXTERNAL_API_TYPE(mufftResult_t, MUFFT_SUCCESS); - -#if !defined(__APPLE__) && defined(PADDLE_WITH_MCCL) -DEFINE_EXTERNAL_API_TYPE(mcclResult_t, mcclSuccess); -#endif - -} // namespace details - -#define PADDLE_ENFORCE_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - 
if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#define PADDLE_WARN_GPU_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - ::phi::enforce::ThrowWarnInternal( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ - } \ - } while (0) - -inline void retry_sleep(unsigned millisecond) { -#ifdef _WIN32 - Sleep(millisecond); -#else - sleep(millisecond); -#endif -} - -#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - int retry_count = 1; \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::phi::enforce::details::ExternalApiType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ - ::phi::enforce::retry_sleep(10000); \ - __cond__ = (COND); \ - ++retry_count; \ - } \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - auto __summary__ = phi::errors::External( \ - ::phi::enforce::build_musa_error_msg(__cond__)); \ - __THROW_ERROR_INTERNAL__(__summary__); \ - } \ - } while (0) - -#undef DEFINE_EXTERNAL_API_TYPE -#endif // PADDLE_WITH_MUSA - - - - - - } // namespace enforce using namespace enforce; // NOLINT } // namespace phi diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc index 9304b42be1644a..a6764dfcf1c31f 100644 --- a/paddle/phi/core/flags.cc +++ b/paddle/phi/core/flags.cc @@ -14,7 +14,7 @@ // limitations under the License. #include "paddle/phi/core/flags.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" #endif @@ -120,7 +120,7 @@ PHI_DEFINE_EXPORTED_bool( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDA related related FLAG @@ -215,7 +215,7 @@ PHI_DEFINE_EXPORTED_bool( true, "Whether enable api kernel fallback to CPU one when not found"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -322,7 +322,7 @@ PHI_DEFINE_EXPORTED_bool( "batch_norm, default is False."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * NCCL related FLAG @@ -541,7 +541,7 @@ PHI_DEFINE_EXPORTED_double( // NOTE(zhiqiu): better to share the flags, otherwise we will have too many // flags. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) /** @@ -785,7 +785,7 @@ PHI_DEFINE_EXPORTED_string(tracer_mkldnn_ops_off, * Example: * Note: Check kernel launch status after every kernel compute. 
*/ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool( check_kernel_launch, false, @@ -800,7 +800,7 @@ PHI_DEFINE_EXPORTED_bool( * Example: * Note: Disable cudnn in conv2d. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); @@ -819,7 +819,7 @@ PHI_DEFINE_EXPORTED_bool(use_fast_math, * Note: Get host by name time. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \ - defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || defined(PADDLE_WITH_CUSTOM_DEVICE) + defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_CUSTOM_DEVICE) PHI_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, "The maximum time for get host by name time"); @@ -1190,11 +1190,11 @@ PHI_DEFINE_EXPORTED_bool(multi_node_sample_use_gpu_table, * Note: nccl blocking wait. */ -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PHI_DEFINE_EXPORTED_bool(benchmark_nccl, false, "enable nccl debug mode to synchronize nccl comm"); @@ -1428,7 +1428,7 @@ PHI_DEFINE_EXPORTED_int32( PHI_DEFINE_EXPORTED_bool(print_ir, false, "Whether print ir debug str."); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) || \ +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) /** * Communication library related FLAG diff --git a/paddle/phi/core/generator.cc b/paddle/phi/core/generator.cc index a2fe426b0ec47b..82d37be80d3c36 100644 --- a/paddle/phi/core/generator.cc +++ b/paddle/phi/core/generator.cc @@ -63,7 +63,7 @@ const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { } const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static int64_t num_cuda_devices = -1; static std::once_flag num_devices_init_flag; @@ -278,7 +278,8 @@ uint64_t Generator::Random64() { std::pair Generator::IncrementOffset( uint64_t increment_offset) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) std::lock_guard lock(this->mu_); uint64_t cur_offset = this->state_.thread_offset; VLOG(10) << "cur_offset = " << cur_offset diff --git a/paddle/phi/core/hostdevice.h b/paddle/phi/core/hostdevice.h index 3295a2f6b37399..decebbe66a5381 100644 --- a/paddle/phi/core/hostdevice.h +++ b/paddle/phi/core/hostdevice.h @@ -18,10 +18,6 @@ #include #endif -#ifdef __MUSACC__ -#include -#endif - #if defined(__xpu__) #include @@ -30,7 +26,7 @@ #include "xpu/kernel/math.h" #endif -#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__) || defined(__MUSACC__)) +#if (defined(__CUDACC__) || defined(__HIPCC__) || defined(__xpu__)) #define HOSTDEVICE __host__ __device__ #define DEVICE __device__ #define HOST __host__ diff --git a/paddle/phi/core/kernel_factory.cc 
b/paddle/phi/core/kernel_factory.cc index 6e534511802bb9..a5c5a3994a81b1 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -124,7 +124,7 @@ const Kernel& KernelFactory::SelectKernelWithGPUDNN( return empty_kernel; } KernelKey kernel_key = KernelKey(const_kernel_key); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); @@ -239,7 +239,7 @@ KernelResult KernelFactory::SelectKernelOrThrowError( KernelKey kernel_key = KernelKey(const_kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, const_kernel_key.dtype()); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (kernel_key.backend() == Backend::GPUDNN) { auto kernel_iter = iter->second.find( {Backend::GPUDNN, phi::DataLayout::ALL_LAYOUT, kernel_key.dtype()}); diff --git a/paddle/phi/core/kernel_registry.cc b/paddle/phi/core/kernel_registry.cc index 77ae9b45c9d682..fa9d531b6534d6 100644 --- a/paddle/phi/core/kernel_registry.cc +++ b/paddle/phi/core/kernel_registry.cc @@ -34,7 +34,7 @@ void SetKernelArgsDef(const std::vector& args_type, #if defined(PADDLE_WITH_DNNL) || arg_type == std::type_index(typeid(const OneDNNContext&)) #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || arg_type == std::type_index(typeid(const GPUContext&)) #elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) || arg_type == std::type_index(typeid(const XPUContext&)) diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 19f76f60f9a1ba..b24e39b6c75bf1 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -1199,7 +1199,7 @@ struct KernelRegistrar { meta_kernel_fn, \ BACKEND_LIST_EXCEPT_CUSTOM) -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #define _DEVICE GPU, #elif defined(PADDLE_WITH_XPU) #define _DEVICE XPU, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 3b55ccd3dbc365..715b4f76392d8f 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -300,7 +300,7 @@ struct KernelImpl { /* DeviceContext Helpers */ PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(GPUContext); #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/core/mixed_vector.cc b/paddle/phi/core/mixed_vector.cc index aba6a0f7bfca27..857bd546befcdf 100644 --- a/paddle/phi/core/mixed_vector.cc +++ b/paddle/phi/core/mixed_vector.cc @@ -33,7 +33,7 @@ template void CopyToCPUHelper(std::vector *cpu_, phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // COPY GPU Data To CPU auto *dev_ctx = static_cast( phi::DeviceContextPool::Instance().Get((*gpu_)->place())); @@ -55,7 +55,7 @@ void CopyCPUDataToCUDAHelper(std::vector *cpu_, 
phi::Allocator::AllocationPtr *gpu_, size_t *gpu_memory_size_, const phi::Place &place) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void *src = cpu_->data(); *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); diff --git a/paddle/phi/core/string_tensor.cc b/paddle/phi/core/string_tensor.cc index 700db5e8d4382e..35c59c2d8d787d 100644 --- a/paddle/phi/core/string_tensor.cc +++ b/paddle/phi/core/string_tensor.cc @@ -116,11 +116,9 @@ void StringTensor::init_holder() { if (place.GetType() == phi::AllocationType::CPU) { std::memset(ptr, 0, bytes_size); } else if (place.GetType() == phi::AllocationType::GPU) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #ifdef PADDLE_WITH_HIP hipMemset(ptr, 0, bytes_size); -#elif defined(PADDLE_WITH_MUSA) - musaMemset(ptr, 0, bytes_size); #else cudaMemset(ptr, 0, bytes_size); #endif diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 03d8b3a0f661ee..17fdef1b9cfbdd 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -65,7 +65,7 @@ void Copy(const Context& dev_ctx, #ifdef PADDLE_WITH_DNNL dst->set_layout(src.layout()); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (dst_place.GetType() == AllocationType::GPU || dst_place.GetType() == AllocationType::GPUPINNED) { dst_ptr = dev_ctx.Alloc( @@ -106,7 +106,7 @@ void Copy(const Context& dev_ctx, if (src_place.GetType() == AllocationType::CPU && dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(src_place, dst_ptr, src_place, src_ptr, size); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if ((src_place.GetType() == AllocationType::CPU || src_place.GetType() == AllocationType::GPUPINNED) && // NOLINT (dst_place.GetType() == AllocationType::CPU || @@ -394,7 +394,7 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, TensorArray* dst); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -476,7 +476,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -530,7 +530,7 @@ void TensorFromVector(const std::vector& src, if (dst_place.GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -622,7 +622,7 @@ void TensorFromArray(const T* src, if (dst_place.GetType() == AllocationType::CPU) { 
memory_utils::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (dst_place.GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -722,7 +722,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, @@ -764,7 +764,7 @@ void TensorToVector(const phi::DenseTensor& src, if (src.place().GetType() == AllocationType::CPU) { memory_utils::Copy(dst_place, dst_ptr, src.place(), src_ptr, size); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (src.place().GetType() == AllocationType::GPU) { // NOLINT memory_utils::Copy(dst_place, dst_ptr, diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index ea1caf4ac067d8..449d7cbe8966df 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -211,35 +211,34 @@ inline int TransToProtoVarType(const DataType& dtype) { } } -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_MCCL) -inline mcclDataType_t ToNCCLDataType(DataType type) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +inline ncclDataType_t ToNCCLDataType(DataType type) { if (type == DataType::FLOAT32) { - return mcclFloat; + return ncclFloat; } else if (type == DataType::FLOAT64) { - return mcclDouble; + return ncclDouble; } else if (type == DataType::INT32) { - return mcclInt; + return ncclInt; } else if (type == DataType::INT64) { - return mcclInt64; + return ncclInt64; } else if (type == DataType::FLOAT16) { - return mcclFloat16; + return ncclFloat16; } else if (type == DataType::UINT8) { - return mcclUint8; + return ncclUint8; } else if (type == DataType::INT8) { - return mcclInt8; + return ncclInt8; } else if (type == DataType::BOOL) { - return mcclUint8; - // } else if (type == DataType::BFLOAT16) { - // return ncclBfloat16; + return ncclUint8; +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 + } else if (type == DataType::BFLOAT16) { + return ncclBfloat16; +#endif } else { PADDLE_THROW( errors::Unimplemented("This datatype in nccl is not supported.")); } } #endif - - - #if defined(PADDLE_WITH_XPU_BKCL) inline BKCLDataType ToBKCLDataType(DataType type) { if (type == DataType::FLOAT32) { diff --git a/paddle/phi/core/utils/type_info.cc b/paddle/phi/core/utils/type_info.cc index 63c9cf63f9a320..b419338401eeac 100644 --- a/paddle/phi/core/utils/type_info.cc +++ b/paddle/phi/core/utils/type_info.cc @@ -54,12 +54,12 @@ template class TypeInfoTraits; template class TypeInfoTraits; template class TypeInfoTraits; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) || \ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ defined(PADDLE_WITH_XPU_KP) template class TypeInfoTraits; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template class TypeInfoTraits; #endif diff --git 
a/paddle/phi/core/utils/visit_place.h b/paddle/phi/core/utils/visit_place.h index 34a8fca61fbbee..6318b17647cd61 100644 --- a/paddle/phi/core/utils/visit_place.h +++ b/paddle/phi/core/utils/visit_place.h @@ -25,7 +25,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, const Visitor& visitor) { switch (place.GetType()) { case phi::AllocationType::GPU: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::GPUPlace p(place.GetDeviceId()); return visitor(p); #else @@ -35,7 +35,7 @@ typename Visitor::result_type VisitPlace(const phi::Place& place, #endif } case phi::AllocationType::GPUPINNED: { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) phi::GPUPinnedPlace p; return visitor(p); #else diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index 6c61c3964b52d6..7ee12e26d7d0ef 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -150,7 +150,7 @@ namespace phi { ///////// BOOL and Floating and Integral Dispatch Marco /////////// -#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL) && !defined(PADDLE_WITH_MCCL) +#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL) #define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ @@ -355,7 +355,7 @@ namespace phi { "`"); \ } \ }() -#if defined(PADDLE_WITH_XPU) +#if defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_HIP) #define PD_VISIT_ALL_TYPES(TYPE, NAME, ...) \ [&] { \ const auto& __dtype__ = TYPE; \ diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index ac3eb1f3cc12fc..eee92aa1380449 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -932,7 +932,7 @@ void CoalesceTensorInferMeta(const std::vector& input, size_of_dtype = static_cast(phi::SizeOf(dtype)); } if (config.is_runtime) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int64_t numel = 0; for (auto item : input) { const auto& dim = item->dims(); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 2df3f34b57936c..f38a842a669873 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -22,9 +22,6 @@ add_subdirectory(autotune) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) file(GLOB kernel_h "*.h" "selected_rows/*.h" "sparse/*.h" "strings/*.h") -if(WITH_MUSA) - list(REMOVE_ITEM kernel_cu "sparse/*.h") -endif() file(GLOB kernel_impl_h "impl/*.h" "selected_rows/impl/*.h") file(GLOB kernel_primitive_h "primitive/*.h") @@ -43,43 +40,6 @@ file( "strings/gpu/*.cu" "fusion/gpu/*.cu") -if(WITH_MUSA) - # 创建要排除的文件模式列表 - file( - GLOB files_to_remove - RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" - "sparse/gpu/*.cu" - "gpudnn/*.cu") - - list(REMOVE_ITEM kernel_cu ${files_to_remove}) - message(STATUS "files_to_remove:${files_to_remove}") - - list( - REMOVE_ITEM - kernel_cu - "strings/gpu/strings_lower_upper_kernel.cu" - "strings/gpu/strings_copy_kernel.cu" - "fusion/gpu/block_multi_head_attention_kernel.cu" - "gpu/cudnn_lstm_kernel.cu" - "gpu/cudnn_lstm_grad_kernel.cu" - "gpu/instance_norm_kernel.cu" - "gpu/instance_norm_grad_kernel.cu" - "gpu/log_softmax_kernel.cu" - "gpu/log_softmax_grad_kernel.cu" - 
"gpu/weighted_sample_neighbors_kernel.cu" - "gpu/cross_entropy_kernel.cu" - "gpu/cross_entropy_grad_kernel.cu" - "gpu/gelu_kernel.cu" - "gpu/gelu_grad_kernel.cu" - "gpu/rnn_kernel.cu.cc" - "gpu/rnn_grad_kernel.cu.cc" - "gpu/clip_by_norm_kernel.cu" - "selected_rows/gpu/clip_by_norm_kernel.cu" - "gpu/softmax_grad_kernel.cu" - "gpu/softmax_kernel.cu" - ) -endif() - if(APPLE OR WIN32) list(REMOVE_ITEM kernel_cu "fusion/gpu/fusion_group_kernel.cu") endif() @@ -217,6 +177,32 @@ if(NOT WITH_CUDNN_FRONTEND) "fusion/gpu/fused_dconv_drelu_dbn_kernel.cu") endif() +# Note(qili93): remove kernels not supported on DCU yet +if(WITH_ROCM) + list( + REMOVE_ITEM + kernel_cu + "gpu/affine_grid_grad_kernel.cu" + "gpu/apply_per_channel_scale_kernel.cu" + "gpu/cholesky_solve_kernel.cu" + "gpu/eigh_kernel.cu" + "gpu/eigvalsh_kernel.cu" + "gpu/lstsq_kernel.cu" + "gpu/lu_kernel.cu" + "gpu/matrix_rank_kernel.cu" + "gpu/matrix_rank_tol_kernel.cu" + "gpu/multiclass_nms3_kernel.cu" + "gpu/put_along_axis_grad_kernel.cu" + "gpu/put_along_axis_kernel.cu" + "gpu/qr_kernel.cu" + "gpu/svd_kernel.cu" + "gpudnn/mha_cudnn_frontend.cu" + "fusion/gpu/block_multi_head_attention_kernel.cu" + "fusion/gpu/fused_bn_add_activation_grad_kernel.cu" + "fusion/gpu/fused_bn_add_activation_kernel.cu" + "fusion/gpu/fusion_transpose_flatten_concat_kernel.cu") +endif() + set(cc_search_pattern "*.cc" "cpu/*.cc" @@ -233,16 +219,16 @@ set(cc_search_pattern "fusion/*.cc" "stride/*.cc" "fusion/cpu/*.cc") -if(WITH_MUSA) - list(REMOVE_ITEM cc_search_pattern "sparse/*.cc") - list(REMOVE_ITEM cc_search_pattern "sparse/cpu/*.cc") -endif() if(WITH_MKLDNN) set(cc_search_pattern ${cc_search_pattern} "legacy/onednn/*.cc" "onednn/*.cc" "fusion/onednn/*.cc") endif() +if(WITH_CUSTOM_DEVICE) + set(cc_search_pattern ${cc_search_pattern} "custom/*.cc") +endif() + file( GLOB kernel_cc RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" @@ -266,7 +252,7 @@ file( "xpu/*.cc" "legacy/xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc" "sparse/xpu/*.cc") -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) collect_srcs(kernels_srcs SRCS ${kernel_cu}) kernel_declare("${kernel_cu}") endif() diff --git a/paddle/phi/kernels/array_kernel.cc b/paddle/phi/kernels/array_kernel.cc index 5389a26479213a..8a599dcf9d80d8 100644 --- a/paddle/phi/kernels/array_kernel.cc +++ b/paddle/phi/kernels/array_kernel.cc @@ -134,7 +134,7 @@ PD_REGISTER_KERNEL(create_array, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(create_array, GPU, ALL_LAYOUT, @@ -178,7 +178,7 @@ PD_REGISTER_KERNEL(array_read, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(array_read, GPU, ALL_LAYOUT, @@ -208,7 +208,7 @@ PD_REGISTER_KERNEL(array_write, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(array_write, GPU, ALL_LAYOUT, @@ -238,7 +238,7 @@ PD_REGISTER_KERNEL(array_to_tensor, phi::dtype::complex, phi::dtype::complex) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(array_to_tensor, GPU, ALL_LAYOUT, diff --git 
a/paddle/phi/kernels/assign_kernel.cc b/paddle/phi/kernels/assign_kernel.cc index eb884d53f3cd63..b4504f83818d77 100644 --- a/paddle/phi/kernels/assign_kernel.cc +++ b/paddle/phi/kernels/assign_kernel.cc @@ -139,7 +139,7 @@ PD_REGISTER_KERNEL(assign_value, int8_t, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL_FOR_ALL_DTYPE(assign, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h index 01ba364ad3d3d5..b04c46351c2cfd 100644 --- a/paddle/phi/kernels/autotune/gpu_timer.h +++ b/paddle/phi/kernels/autotune/gpu_timer.h @@ -30,15 +30,11 @@ #include #endif -#ifdef PADDLE_WITH_MUSA -#include -#endif - namespace phi { -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) -static void RecordEventTimerCallback(musaStream_t stream, - musaError_t status, +#ifdef PADDLE_WITH_HIP +static void RecordEventTimerCallback(hipStream_t stream, + hipError_t status, void *user_data) { struct timeval time_now {}; gettimeofday(&time_now, nullptr); @@ -64,9 +60,6 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventCreate(&start_); hipEventCreate(&stop_); -#elif defined(PADDLE_WITH_MUSA) - musaEventCreate(&start_); - musaEventCreate(&stop_); #else cudaEventCreate(&start_); cudaEventCreate(&stop_); @@ -81,9 +74,6 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventDestroy(start_); hipEventDestroy(stop_); -#elif defined(PADDLE_WITH_MUSA) - musaEventDestroy(start_); - musaEventDestroy(stop_); #else cudaEventDestroy(start_); cudaEventDestroy(stop_); @@ -93,8 +83,6 @@ class GpuTimer { void Start(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(start_, stream); -#elif defined(PADDLE_WITH_MUSA) - musaEventRecord(start_, stream); #else cudaEventRecord(start_, stream); #endif @@ -103,8 +91,6 @@ class GpuTimer { void Stop(gpuStream_t stream) { #ifdef PADDLE_WITH_HIP hipEventRecord(stop_, stream); -#elif defined(PADDLE_WITH_MUSA) - musaEventRecord(stop_, stream); #else cudaEventRecord(stop_, stream); #endif @@ -115,9 +101,6 @@ class GpuTimer { #ifdef PADDLE_WITH_HIP hipEventSynchronize(stop_); hipEventElapsedTime(&milliseconds, start_, stop_); -#elif defined(PADDLE_WITH_MUSA) - musaEventSynchronize(stop_); - musaEventElapsedTime(&milliseconds, start_, stop_); #else cudaEventSynchronize(stop_); cudaEventElapsedTime(&milliseconds, start_, stop_); @@ -161,12 +144,6 @@ class CalculateStreamTimer { RecordEventTimerCallback, reinterpret_cast(&start_time_), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(calculated_stream_, - RecordEventTimerCallback, - reinterpret_cast(&start_time_), - 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(calculated_stream_, @@ -186,12 +163,6 @@ class CalculateStreamTimer { RecordEventTimerCallback, reinterpret_cast(&end_time_), 0)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaStreamAddCallback(calculated_stream_, - RecordEventTimerCallback, - reinterpret_cast(&end_time_), - 0)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaStreamAddCallback(calculated_stream_, @@ -207,8 +178,6 @@ class CalculateStreamTimer { if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(calculated_stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif @@ 
-220,8 +189,6 @@ class CalculateStreamTimer { if (calculated_stream_ != nullptr) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamSynchronize(calculated_stream_)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS(musaStreamSynchronize(calculated_stream_)); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(calculated_stream_)); #endif diff --git a/paddle/phi/kernels/batch_norm_kernel.cc b/paddle/phi/kernels/batch_norm_kernel.cc index dba08b0de366af..bf04c99dab0a3c 100644 --- a/paddle/phi/kernels/batch_norm_kernel.cc +++ b/paddle/phi/kernels/batch_norm_kernel.cc @@ -97,7 +97,7 @@ PD_REGISTER_KERNEL(batch_norm_infer, } #endif #endif -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP PD_REGISTER_KERNEL(batch_norm_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/check_memory_continue_kernel.cc b/paddle/phi/kernels/check_memory_continue_kernel.cc index 9f4b51281cd37f..6e496a355302fc 100644 --- a/paddle/phi/kernels/check_memory_continue_kernel.cc +++ b/paddle/phi/kernels/check_memory_continue_kernel.cc @@ -88,7 +88,7 @@ PD_REGISTER_KERNEL(check_memory_continue, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(check_memory_continue, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 2d0ab05a8de78b..a60369af449f4e 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -309,20 +309,6 @@ PD_REGISTER_KERNEL(coalesce_tensor, } #endif -#ifdef PADDLE_WITH_MUSA -PD_REGISTER_KERNEL(coalesce_tensor, - GPU, - ALL_LAYOUT, - phi::CoalesceTensorKernel, - phi::dtype::float16, - int, - float, - double) { - kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); -} -#endif - #ifdef PADDLE_WITH_XPU PD_REGISTER_KERNEL(coalesce_tensor, XPU, diff --git a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc index acd84a80be2ad1..47e804b7de2775 100644 --- a/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/cum_maxmin_grad_kernel.cc @@ -38,10 +38,10 @@ void CummaxGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } @@ -61,10 +61,10 @@ void CumminGradKernel(const Context& dev_ctx, } if (dtype == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } else if (dtype == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, indices, out_grad, dev_ctx); + *x_grad, axis, indices, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc b/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc index 0b11e3d6f98da9..aceced1ce85313 100644 --- a/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc +++ b/paddle/phi/kernels/cpu/decode_jpeg_kernel.cc @@ -29,4 +29,4 @@ void DecodeJpegKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - decode_jpeg, CPU, ALL_LAYOUT, phi::DecodeJpegKernel, uint8_t) {} \ 
No newline at end of file + decode_jpeg, CPU, ALL_LAYOUT, phi::DecodeJpegKernel, uint8_t) {} diff --git a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc index 81ed7170d7a24f..65ee3c1851003e 100644 --- a/paddle/phi/kernels/cpu/gelu_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_grad_kernel.cc @@ -64,7 +64,7 @@ struct GeluGradFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) auto x_data = x.data(); auto dx_data = dx.data(); auto dout_data = dout.data(); diff --git a/paddle/phi/kernels/cpu/gelu_kernel.cc b/paddle/phi/kernels/cpu/gelu_kernel.cc index 47ab1a78390662..dbab3bd3266649 100644 --- a/paddle/phi/kernels/cpu/gelu_kernel.cc +++ b/paddle/phi/kernels/cpu/gelu_kernel.cc @@ -53,7 +53,7 @@ struct GeluFunctor { } else { #if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) auto x_data = x.data(); auto out_data = out.data(); int n = std::min(x.size(), out.size()); diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc index dd7b762849d16b..aeb2071b136de8 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -25,11 +25,14 @@ namespace phi { template void PutAlongAxisGradKernel(const Context& dev_ctx, - const DenseTensor& x UNUSED, + const DenseTensor& x, const DenseTensor& index, + const DenseTensor& value, + const DenseTensor& out, const DenseTensor& out_grad, int axis, - const std::string& reduce UNUSED, + const std::string& reduce, + bool include_self, DenseTensor* x_grad, DenseTensor* value_grad) { PADDLE_ENFORCE_EQ( @@ -40,31 +43,135 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (x_grad) { phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_input_grad_kernel( - // Here passing an unused argument out_grad, because it's - // convenient to instantiate a bunch of template function with the - // same arguments list. - out_grad, - axis, - index, - *x_grad, - dev_ctx); - } else { - phi::funcs::cpu_scatter_input_grad_kernel( - out_grad, axis, index, *x_grad, dev_ctx); + if (include_self == false || reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. 
+ out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } + } else if (reduce == "multiply" || reduce == "mul" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_input_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *x_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + // Here passing an unused argument out_grad, because it's + // convenient to instantiate a bunch of template function with the + // same arguments list. + out_grad, + axis, + index, + *x_grad, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mean_input_grad_kernel( + out_grad, axis, index, *x_grad, include_self, dev_ctx); + } } } if (value_grad) { value_grad->Resize(index.dims()); dev_ctx.template Alloc(value_grad); - if (index_type == DataType::INT32) { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); - } else { - phi::funcs::cpu_scatter_value_grad_kernel( - out_grad, axis, index, *value_grad, dev_ctx); + auto* grad_data = value_grad->data(); + int64_t grad_size = value_grad->numel(); + memset(grad_data, 0, sizeof(T) * grad_size); + if (reduce == "assign") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_value_grad_kernel( + out_grad, axis, index, *value_grad, include_self, dev_ctx); + } + } else if (reduce == "add" || reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_add_mean_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } + } else if (reduce == "mul" || reduce == "multiply" || reduce == "amin" || + reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } else { + phi::funcs::cpu_scatter_mul_min_max_value_grad_kernel( + out_grad, + axis, + index, + out, + x, + value, + *value_grad, + reduce, + include_self, + dev_ctx); + } } } } diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc index 5417f9463a62f8..4411755d61cbaf 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -30,6 +30,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, const DenseTensor& value, int axis, const std::string& reduce, + bool include_self, DenseTensor* out) { PADDLE_ENFORCE_EQ( dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, @@ -41,31 +42,56 @@ void PutAlongAxisKernel(const Context& dev_ctx, if (reduce == "add") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, 
include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "multiply" || reduce == "mul") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_mul_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } } else if (reduce == "assign") { if (index_type == DataType::INT32) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); } else if (index_type == DataType::INT64) { phi::funcs::cpu_scatter_assign_kernel( - *out, axis, index, value, dev_ctx); + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "mean") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_mean_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amax") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_max_kernel( + *out, axis, index, value, include_self, dev_ctx); + } + } else if (reduce == "amin") { + if (index_type == DataType::INT32) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); + } else if (index_type == DataType::INT64) { + phi::funcs::cpu_scatter_min_kernel( + *out, axis, index, value, include_self, dev_ctx); } } else { PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " - "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " + "support reduce op: 'add', 'assign', 'mul', 'mean', 'amin', 'amax' and " + "'multiply', the " "default reduce " "op is 'assign' ", reduce)); diff --git a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc index b7b33d4290daec..66f3ef0cd790d1 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_grad_kernel.cc @@ -104,7 +104,8 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_grad, CPU, @@ -113,4 +114,5 @@ PD_REGISTER_KERNEL(repeat_interleave_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc index 388e243eff42a0..8b00d7e38f304c 100644 --- a/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc +++ b/paddle/phi/kernels/cpu/repeat_interleave_kernel.cc @@ -25,7 +25,8 @@ PD_REGISTER_KERNEL(repeat_interleave, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, CPU, @@ -34,4 +35,5 @@ PD_REGISTER_KERNEL(repeat_interleave_with_tensor_index, float, double, int, - int64_t) {} + int64_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc index 
ed35513d985505..237a892dbb356c 100644 --- a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -35,3 +35,20 @@ PD_REGISTER_KERNEL(set_value_grad, phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} + +PD_REGISTER_KERNEL(set_value_with_scalar_grad, + CPU, + ALL_LAYOUT, + phi::SetValueWithScalarGradKernel, + float, + double, + int, + int64_t, + bool, + int16_t, + uint8_t, + int8_t, + phi::dtype::bfloat16, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc index 8a7238203ec647..4e5fc0c305100c 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_grad_kernel.cc @@ -50,10 +50,11 @@ void TakeAlongAxisGradKernel(const Context& dev_ctx, axis, index, out_grad, + true, dev_ctx); // the gradient of gather is scatter } else if (index_type == phi::DataType::INT64) { phi::funcs::cpu_scatter_add_kernel( - *x_grad, axis, index, out_grad, dev_ctx); + *x_grad, axis, index, out_grad, true, dev_ctx); } } diff --git a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc index d1b4a24b54eba5..d006f688ae2434 100644 --- a/paddle/phi/kernels/cpu/take_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/take_along_axis_kernel.cc @@ -38,9 +38,11 @@ void TakeAlongAxisKernel(const Context& dev_ctx, const auto& index_type = index.dtype(); if (index_type == DataType::INT32) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } else if (index_type == DataType::INT64) { - phi::funcs::cpu_gather_kernel(x, axis, index, *out, dev_ctx); + phi::funcs::cpu_gather_kernel( + x, axis, index, *out, true, dev_ctx); } } diff --git a/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc new file mode 100644 index 00000000000000..ff61688513b139 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_grad_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_grad_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingGradKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + const DenseTensor& out_grad, + int64_t start_index, + DenseTensor* w_grad) { + w_grad->Resize(w.dims()); + dev_ctx.template Alloc(w_grad, w.dtype()); + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->set_meta(w.meta()); + dev_ctx.Alloc(w_tmp.get(), w_tmp->dtype()); + auto out_grad_tmp = std::make_shared(); + out_grad_tmp->ShareDataWith(out_grad).Resize({K, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp), + out_grad_tensor(out_grad_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto real_ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_grad_tensor_mul_mask = + paddle::experimental::reshape(out_grad_tensor, {K, D}) + .multiply(paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w.dtype()), + {K, 1})); + paddle::Tensor w_grad_tensor; + paddle::experimental::embedding_grad(real_ids_tensor, + w_tensor, + out_grad_tensor_mul_mask, + -1, + false, + &w_grad_tensor); + w_grad->ShareDataWith( + *reinterpret_cast(w_grad_tensor.impl().get())); + + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding_grad ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding_grad, + Custom, + ALL_LAYOUT, + phi::CEmbeddingGradKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/custom/c_embedding_kernel.cc b/paddle/phi/kernels/custom/c_embedding_kernel.cc new file mode 100644 index 00000000000000..0cacf61d46f3a8 --- /dev/null +++ b/paddle/phi/kernels/custom/c_embedding_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/c_embedding_kernel.h" +#include "glog/logging.h" +#include "paddle/phi/api/backward/backward_api.h" +#include "paddle/phi/api/include/api.h" +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +void CEmbeddingKernel(const Context& dev_ctx, + const DenseTensor& w, + const DenseTensor& ids, + int64_t start_index, + int64_t vocab_size, + DenseTensor* out) { + const auto& index_type = ids.dtype(); + if (index_type == phi::DataType::INT32 || + index_type == phi::DataType::INT64) { + auto out_dims = out->dims(); + auto K = ids.numel(); + auto N = w.dims()[0]; + auto D = w.dims()[1]; + + auto x_tmp = std::make_shared(); + x_tmp->ShareDataWith(ids).Resize({K}); + auto w_tmp = std::make_shared(); + w_tmp->ShareDataWith(w).Resize({N, D}); + paddle::Tensor x_tensor(x_tmp), w_tensor(w_tmp); + + auto start_index_tensor = paddle::experimental::full_like( + x_tensor, start_index, x_tensor.dtype(), x_tensor.place()); + auto end_index_tensor = paddle::experimental::full_like( + x_tensor, start_index + N, x_tensor.dtype(), x_tensor.place()); + auto ids_mask_tensor = paddle::experimental::logical_and( + x_tensor.greater_equal(start_index_tensor), + x_tensor.less_than(end_index_tensor)); + auto ids_tensor = (x_tensor - start_index_tensor) + .multiply(paddle::experimental::cast( + ids_mask_tensor, x_tensor.dtype())); + auto out_tensor = + paddle::experimental::reshape( + paddle::experimental::cast(ids_mask_tensor, w_tensor.dtype()), + {K, 1}) + .multiply(paddle::experimental::reshape( + paddle::experimental::embedding( + ids_tensor, w_tensor, -1, false), + {K, D})); + out->ShareDataWith( + *reinterpret_cast(out_tensor.impl().get())) + .Resize(out_dims); + } else { + PADDLE_THROW(phi::errors::Unavailable( + "Custom Device c_embedding ids only support int32 or int64.")); + } +} +#endif +} // namespace phi + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +PD_REGISTER_KERNEL(c_embedding, + Custom, + ALL_LAYOUT, + phi::CEmbeddingKernel, + float, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif diff --git a/paddle/phi/kernels/dist_grad_kernel.cc b/paddle/phi/kernels/dist_grad_kernel.cc index 170f9a3a4d6082..088a4fe4ffd266 100644 --- a/paddle/phi/kernels/dist_grad_kernel.cc +++ b/paddle/phi/kernels/dist_grad_kernel.cc @@ -97,7 +97,7 @@ void DistGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( dist_grad, CPU, ALL_LAYOUT, phi::DistGradKernel, float, double) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(dist_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 60fc5236abc940..d2391a5702d4b1 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -74,7 +74,7 @@ PD_REGISTER_KERNEL(empty_like, kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 2b7c400bc64641..ebe1b1d24e50a5 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -46,7 +46,7 @@ PD_REGISTER_KERNEL(flatten_grad, int, 
int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 6b22ac75181791..dc61e6a650efa1 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -75,7 +75,7 @@ PD_REGISTER_KERNEL(flatten, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(flatten_infer, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc index 1886f5af4c1cb7..cd603dd57e64d1 100644 --- a/paddle/phi/kernels/full_kernel.cc +++ b/paddle/phi/kernels/full_kernel.cc @@ -49,7 +49,7 @@ PD_REGISTER_KERNEL(full_batch_size_like, bool) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(full_batch_size_like, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/CMakeLists.txt b/paddle/phi/kernels/funcs/CMakeLists.txt index f2d43a19a246d6..d124e269e5c007 100644 --- a/paddle/phi/kernels/funcs/CMakeLists.txt +++ b/paddle/phi/kernels/funcs/CMakeLists.txt @@ -8,16 +8,16 @@ file( GLOB func_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") -if(WITH_GPU OR WITH_ROCM OR WITH_MUSA) +if(WITH_GPU OR WITH_ROCM) file( GLOB func_cu_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") endif() -if(WITH_MUSA) - list(REMOVE_ITEM func_cu_srcs - "softmax.cu") +# Note(qili93): remove kernels not supported on DCU yet +if(WITH_ROCM) + list(REMOVE_ITEM func_cu_srcs "weight_only_gemv.cu") endif() collect_srcs(kernels_srcs SRCS ${func_cc_srcs} ${func_cu_srcs}) diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index dcad9755ee4e05..06b59644cf11d4 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -3013,7 +3013,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; } }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) template struct CudaLogitFunctor : public BaseActivationFunctor { diff --git a/paddle/phi/kernels/funcs/algorithm.h b/paddle/phi/kernels/funcs/algorithm.h index cab4d32a998268..5f66f6f1abd4d2 100644 --- a/paddle/phi/kernels/funcs/algorithm.h +++ b/paddle/phi/kernels/funcs/algorithm.h @@ -40,7 +40,7 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { template HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group LowerBound +#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group LowerBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/lower_bound auto *first = x; @@ -63,7 +63,7 @@ HOSTDEVICE inline size_t LowerBound(const T1 *x, size_t num, const T2 &val) { template HOSTDEVICE inline size_t UpperBound(const T1 *x, size_t num, const T2 &val) { -#if defined(__CUDA_ARCH__) || defined(__HIPCC__) || defined(__MUSACC__) // @{ Group UpperBound 
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__) // @{ Group UpperBound // The following code is from // https://en.cppreference.com/w/cpp/algorithm/upper_bound auto *first = x; diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 69e13d29874d51..140eca890480f9 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -175,7 +175,7 @@ class Blas { T* c, const int* ldc) const; -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) template void MatMulWithHead(const phi::DenseTensor& mat_a, const MatDescriptor& dim_a, @@ -303,7 +303,7 @@ class Blas { int batchCount) const; #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) template void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, @@ -360,7 +360,7 @@ class Blas { T* B, int ldb) const; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; @@ -445,7 +445,7 @@ class BlasT : private Blas { Base()->template CSRMM(args...); } -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) template void MatMulWithHead(ARGS... args) const { Base()->template MatMulWithHead(args...); @@ -543,7 +543,7 @@ class BlasT : private Blas { Base()->template TRSM(args...); } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void BatchedGETRF(ARGS... args) const { Base()->template BatchedGETRF(args...); @@ -593,7 +593,3 @@ inline BlasT GetBlas(const DeviceContext& dev_ctx) { #ifdef PADDLE_WITH_HIP #include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" #endif - -#ifdef PADDLE_WITH_MUSA -#include "paddle/phi/kernels/funcs/blas/blas_impl.mu.h" -#endif diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index a4233d9a4147ac..ffafe15b8fcf2d 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1451,7 +1451,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_MUSA) && \ +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template @@ -1698,7 +1698,7 @@ void Blas::MatMul(const T *mat_a, } #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP)&& !defined(PADDLE_WITH_MUSA) + !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: MatMulWithHead /* * Multiple two matrixes with multiple heads diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.mu.h b/paddle/phi/kernels/funcs/blas/blas_impl.mu.h deleted file mode 100644 index c6391acab6d894..00000000000000 --- a/paddle/phi/kernels/funcs/blas/blas_impl.mu.h +++ /dev/null @@ -1,1602 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#if defined(__MUSACC__) -#include -#endif -#include "glog/logging.h" -#include "paddle/utils/flags.h" - -#include "paddle/phi/backends/dynload/mublas.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/flags.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -PHI_DECLARE_bool(enable_cublas_tensor_op_math); -PHI_DECLARE_bool(gemm_use_half_precision_compute_type); - -namespace phi { -namespace funcs { - -template -struct CUBlas; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasScopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemmBatched(args...)); - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mublasSgemmStridedBatched(args...)); - } - - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const float *beta, - void *C, - musaDataType_t Ctype, - int ldc) { -// Because the gcc 4.8 doesn't expand template parameter pack that -// appears in a lambda-expression, I can not use template parameter pack -// here. - // VLOG(5) << "use_tensor_op_math: " - // << (dev_ctx->tensor_core_available() ? "True" : "False"); - // dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgemmEx(handle, - // transa, - // transb, - // m, - // n, - // k, - // alpha, - // A, - // Atype, - // lda, - // B, - // Btype, - // ldb, - // beta, - // C, - // Ctype, - // ldc)); - // }); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgemmEx.")); - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasStrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetrfBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgetrfBatched.")); - } - - template - static void GETRI_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetriBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgetriBatched.")); - } - - template - static void MATINV_BATCH(ARGS... 
args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSmatinvBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSmatinvBatched.")); - } - - template - static void GETRS_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasSgetrsBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasSgetrsBatched.")); - } - - template - static void TRSM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasStrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDcopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgemmBatched(args...)); - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::mublasDgemmStridedBatched(args...)); - } - - template - static void GEMM_EX(ARGS... args UNUSED) { - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgemmEx.")); - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDtrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetrfBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgetrfBatched.")); - } - - template - static void GETRI_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetriBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgetriBatched.")); - } - - template - static void MATINV_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDmatinvBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDmatinvBatched.")); - } - - template - static void GETRS_BATCH(ARGS... args) { - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDgetrsBatched(args...)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasDgetrsBatched.")); - } - - template - static void TRSM_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasDtrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - using float16 = phi::dtype::float16; - - static void GEMM(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - const float16 *B, - int ldb, - const float16 *beta, - float16 *C, - int ldc) { - // PADDLE_ENFORCE_GPU_SUCCESS( - // phi::dynload::mublasHgemm(handle, - // transa, - // transb, - // m, - // n, - // k, - // reinterpret_cast(alpha), - // reinterpret_cast(A), - // lda, - // reinterpret_cast(B), - // ldb, - // reinterpret_cast(beta), - // reinterpret_cast<__half *>(C), - // ldc)); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasHgemm.")); - } - - static void GEMM_BATCH(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const float16 **A, - musaDataType_t Atype, - int lda, - const float16 **B, - musaDataType_t Btype, - int ldb, - const float *beta, - float16 **C, - musaDataType_t Ctype, - int ldc, - int batchCount, - musaDataType_t computeType) { - PADDLE_THROW(phi::errors::Unimplemented( - "mublasGemmBatchedEx is not supported")); - } - - static void GEMM_STRIDED_BATCH(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - long long int strideA, // NOLINT - const float16 *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const float16 *beta, - float16 *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { - PADDLE_THROW(phi::errors::Unimplemented( - "mublasHgemmStridedBatched is not supported")); - // PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasHgemmStridedBatched( - // handle, - // transa, - // transb, - // m, - // n, - // k, - // reinterpret_cast(alpha), - // reinterpret_cast(A), - // lda, - // strideA, - // reinterpret_cast(B), - // ldb, - // strideB, - // reinterpret_cast(beta), - // reinterpret_cast<__half *>(C), - // ldc, - // strideC, - // batchCount)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const void *beta, - void *C, - musaDataType_t Ctype, - int ldc, - musaDataType_t computeType) { - mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? 
"True" : "False"); - - dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); - } -}; - -template <> -struct CUBlas> { - static void GEMV(mublasHandle_t handle, - mublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(mublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); - } - - static void GEMM(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void TRSM(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
- // https://docs.nvidia.com/muda/mublas/index.html#mublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const void *beta, - void *C, - musaDataType_t Ctype, - int ldc, - musaDataType_t computeType) { - mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - - dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); - } - - static void TRSM_BATCH(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasCtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } -}; - -template <> -struct CUBlas> { - static void GEMV(mublasHandle_t handle, - mublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(mublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH( - mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); - } - - static void GEMM(mublasHandle_t handle, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - 
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void TRSM(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - static void TRSM_BATCH(mublasHandle_t handle, - mublasSideMode_t side, - mublasFillMode_t uplo, - mublasOperation_t transa, - mublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasZtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - mublasOperation_t transa, - mublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - musaDataType_t Atype, - int lda, - const void *B, - musaDataType_t Btype, - int ldb, - const void *beta, - void *C, - musaDataType_t Ctype, - int ldc, - musaDataType_t computeType) { - mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - - dev_ctx->TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::mublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); - - } -}; - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }); -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // // Note that cublas follows fortran order, so the order is different from - // // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? 
MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - MUSA_R_16F, - ldb, - A, - MUSA_R_16F, - lda, - &h_beta, - C, - MUSA_R_16F, - N, - (musaDataType_t)0);//MUSA_R_32F https://jira.mthreads.com/browse/SW-37038 -} - - - - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::GEMM for dtype complex is not supported on MUSA now!")); -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = - thrust::complex(beta.real, beta.imag); - auto &cuda_ctx = const_cast(context_); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &c_alpha, - B, - // Originally, this was MUSA_C_64F, but due to some bugs, it was necessary to manually specify a value - // jira:https://jira.mthreads.com/browse/SW-37038 - (musaDataType_t)5,//MUSA_C_64F - ldb, - A, - (musaDataType_t)5,//MUSA_C_64F - lda, - &c_beta, - C, - (musaDataType_t)5,//MUSA_C_64F - N, - (musaDataType_t)5);//MUSA_C_64F -} - - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - mublasOperation_t cuTransA = transA ? MUBLAS_OP_T : MUBLAS_OP_N; - mublasOperation_t cuTransB = transB ? 
MUBLAS_OP_T : MUBLAS_OP_N; - - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(context_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - (musaDataType_t)0,//MUSA_R_32F, - ldb, - A, - (musaDataType_t)0,//MUSA_R_32F, - lda, - &beta, - C, - (musaDataType_t)0,//MUSA_R_32F, - ldc); - } else { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); - } -} - -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - int lda, - const phi::dtype::float16 *B, - int ldb, - phi::dtype::float16 beta, - phi::dtype::float16 *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - mublasOperation_t cuTransA = transA ? MUBLAS_OP_T : MUBLAS_OP_N; - mublasOperation_t cuTransB = transB ? MUBLAS_OP_T : MUBLAS_OP_N; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); -} -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - int lda, - const phi::dtype::bfloat16 *B, - int ldb, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int ldc) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::GEMM for dtype bfloat16 is not supported on MUSA now!")); -} - -template <> -template -void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); -} - -template <> -template -void Blas::SCAL(int n, const T alpha, T *x) const { - context_.CublasCall( - [&](mublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - context_.CublasCall( - [&](mublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - mublasOperation_t cuTransA = !trans_a ? MUBLAS_OP_T : MUBLAS_OP_N; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve - // it. 
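The float16 GEMV specialization above, and the bfloat16 one whose body follows, both lower a matrix-vector product to GEMM with a unit output dimension, since the half-precision BLAS entry points provide no GEMV. A minimal, self-contained sketch of the identity they rely on (plain C++, illustrative only; NaiveGemm is an invented helper, not Paddle code):

// Illustrative only -- standalone C++, not part of Paddle. It checks the
// identity used by the fp16/bf16 GEMV fallbacks: a matrix-vector product
// is a GEMM in which one output dimension equals 1.
#include <cassert>
#include <cmath>
#include <vector>

// Naive row-major GEMM: C(MxN) = alpha * A(MxK) * B(KxN) + beta * C.
static void NaiveGemm(int M, int N, int K, float alpha, const float* A,
                      const float* B, float beta, float* C) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) acc += A[i * K + k] * B[k * N + j];
      C[i * N + j] = alpha * acc + beta * C[i * N + j];
    }
}

int main() {
  const int M = 3, N = 4;
  std::vector<float> A(M * N), x(N), y_gemv(M, 0.f), y_gemm(M, 0.f);
  for (int i = 0; i < M * N; ++i) A[i] = 0.5f * i;
  for (int j = 0; j < N; ++j) x[j] = 1.f + j;

  // Reference GEMV: y = A * x.
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) y_gemv[i] += A[i * N + j] * x[j];

  // Same result as a GEMM whose second output dimension is 1:
  // C(Mx1) = A(MxN) * x(Nx1), matching the !trans_a branch.
  NaiveGemm(M, 1, N, 1.f, A.data(), x.data(), 0.f, y_gemm.data());

  for (int i = 0; i < M; ++i) assert(std::fabs(y_gemv[i] - y_gemm[i]) < 1e-5f);
  return 0;
}

The trans_a branch that follows is the same idea with the roles swapped: x is treated as a 1 x M row vector multiplied against A, giving a 1 x N result.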
- if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - const int64_t strideC = M * N; - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - // int lda = (transA == CblasNoTrans) ? K : M; - // int ldb = (transB == CblasNoTrans) ? N : K; - // int ldc = N; - // mublasOperation_t cuTransA = - // (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - // mublasOperation_t cuTransB = - // (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - // const int64_t strideC = M * N; - - // float h_alpha = static_cast(alpha); - // float h_beta = static_cast(beta); - - // mublasGemmAlgo_t algo = MUBLAS_GEMM_DEFAULT; - // bool use_tensor_op_math = context_.tensor_core_available(); - // if (use_tensor_op_math) { - // algo = MUBLAS_GEMM_DEFAULT_TENSOR_OP; - // } - // VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - - // context_.TensorCoreCublasCallIfAvailable([&](mublasHandle_t handle) { - // PADDLE_ENFORCE_GPU_SUCCESS( - // phi::dynload::mublasGemmStridedBatchedEx(handle, - // cuTransB, - // cuTransA, - // N, - // M, - // K, - // &h_alpha, - // B, - // MUSA_R_16BF, - // ldb, - // strideB, - // A, - // MUSA_R_16BF, - // lda, - // strideA, - // &h_beta, - // C, - // MUSA_R_16BF, - // ldc, - // strideC, - // batchCount, - // MUBLAS_COMPUTE_32F, - // algo)); - // }); - PADDLE_THROW( - phi::errors::Unimplemented("murrently there are not mublasGemmStridedBatchedEx.")); -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - -#if defined(__MUSACC__) -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - double alpha, - const double **A, - const double **B, - double beta, - double **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. 
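The "fortran order" note above is the reason every wrapper in this file swaps its arguments: the transposes are passed as cuTransB, cuTransA, the dimensions as N, M, K, and B is handed over before A. For row-major data, C = A * B is obtained from a column-major GEMM by computing C^T = B^T * A^T, and C^T stored column-major occupies exactly the same bytes as C stored row-major. A standalone check of that identity (plain C++, illustrative only; ColMajorGemm is an invented stand-in for a cuBLAS/muBLAS-style column-major GEMM):

// Illustrative only -- standalone C++, not part of Paddle. It shows why the
// wrappers call a column-major (Fortran-order) GEMM with swapped operands.
#include <cassert>
#include <cmath>
#include <vector>

// Column-major GEMM without transposes: C(m x n) = A(m x k) * B(k x n),
// with leading dimensions lda, ldb, ldc, as a BLAS call would take.
static void ColMajorGemm(int m, int n, int k, const float* A, int lda,
                         const float* B, int ldb, float* C, int ldc) {
  for (int j = 0; j < n; ++j)
    for (int i = 0; i < m; ++i) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = acc;
    }
}

int main() {
  const int M = 2, N = 3, K = 4;
  std::vector<float> A(M * K), B(K * N), C(M * N, 0.f), C_ref(M * N, 0.f);
  for (int i = 0; i < M * K; ++i) A[i] = 0.25f * (i + 1);  // row-major A
  for (int i = 0; i < K * N; ++i) B[i] = 0.50f * (i + 2);  // row-major B

  // Row-major reference: C_ref = A * B.
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      for (int p = 0; p < K; ++p)
        C_ref[i * N + j] += A[i * K + p] * B[p * N + j];

  // The swapped call the wrappers use: row-major B reinterpreted as
  // column-major B^T (N x K, ldb = N), row-major A as column-major A^T
  // (K x M, lda = K); the N x M column-major result is row-major C (M x N).
  ColMajorGemm(N, M, K, B.data(), N, A.data(), K, C.data(), N);

  for (int i = 0; i < M * N; ++i) assert(std::fabs(C[i] - C_ref[i]) < 1e-5f);
  return 0;
}

The batched call that follows applies the same per-matrix mapping to each pointer in the A/B/C arrays.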
- int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - float alpha, - const float **A, - const float **B, - float beta, - float **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasOperation_t cuTransB = - (transB == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 **A, - const phi::dtype::float16 **B, - phi::dtype::float16 beta, - phi::dtype::float16 **C, - int batchCount) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::BatchedGEMM for dtype float16 is not supported on MUSA now!")); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 **A, - const phi::dtype::bfloat16 **B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 **C, - int batchCount) const { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas::BatchedGEMM for bfloat16 is not supported on MUSA now!")); -} -#endif -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - mublasSideMode_t cuSide = - (side == CblasLeft) ? MUBLAS_SIDE_RIGHT : MUBLAS_SIDE_LEFT; - mublasFillMode_t cuUplo = - (uplo == CblasLower) ? MUBLAS_FILL_MODE_UPPER : MUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasDiagType_t cuDiag = - (diag == CblasUnit) ? 
MUBLAS_DIAG_UNIT : MUBLAS_DIAG_NON_UNIT; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::TRSM( - handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); - }); -} - -template <> -template -void Blas::BatchedGETRF( - int n, T **a, int *ipiv, int *info, int batch_size) const { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRI(int n, - const T **a, - const int *ipiv, - T **a_inv, - int *info, - int batch_size) const { - PADDLE_ENFORCE_NE( - a_inv, - a, - phi::errors::InvalidArgument( - "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " - "in-place. The memory space of output matrix (address: %p) cannot " - "overlap memory space of input matrix (address: %p).", - a_inv, - a)); - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedMatInv( - int n, const T **a, T **a_inv, int *info, int batch_size) const { - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T **a, - int lda, - int *ipiv, - T **b, - int ldb, - int *info, - int batch_size) const { - // use CUBLAS_OP_C (conjugate transpose) for complex - mublasOperation_t cuTrans = - (trans == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedTRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T **A, - int lda, - T **B, - int ldb, - int batch_size) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - mublasSideMode_t cuSide = - (side == CblasLeft) ? MUBLAS_SIDE_RIGHT : MUBLAS_SIDE_LEFT; - mublasFillMode_t cuUplo = - (uplo == CblasLower) ? MUBLAS_FILL_MODE_UPPER : MUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - mublasOperation_t cuTransA = - (transA == CblasNoTrans) ? MUBLAS_OP_N : MUBLAS_OP_T; - mublasDiagType_t cuDiag = - (diag == CblasUnit) ? MUBLAS_DIAG_UNIT : MUBLAS_DIAG_NON_UNIT; - - context_.CublasCall([&](mublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }); -} - -} // namespace funcs -} // namespace phi diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index c25ab4b55cb53d..822801e10c357c 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include "paddle/phi/kernels/funcs/elementwise_base.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/phi/kernels/funcs/dims_simplifier.h" namespace kps = phi::kps; @@ -27,7 +27,7 @@ namespace kps = phi::kps; namespace phi { namespace funcs { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) enum BroadcastType { kMixed = 1, kBroadcast = 2, kElementwise = 3 }; diff --git a/paddle/phi/kernels/funcs/check_numerics_utils.h b/paddle/phi/kernels/funcs/check_numerics_utils.h index 6d426d764e2214..76adc40c4f9f95 100644 --- a/paddle/phi/kernels/funcs/check_numerics_utils.h +++ b/paddle/phi/kernels/funcs/check_numerics_utils.h @@ -86,7 +86,7 @@ HOSTDEVICE static void PrintAndThrowError(const char* debug_info, int64_t num_nan, int64_t num_inf, int64_t num_zero) { -#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) && !defined(__MUSACC__) +#if !defined(__HIPCC__) && !defined(__CUDA_ARCH__) PADDLE_THROW(phi::errors::PreconditionNotMet( "There are NAN or INF (num_nan=%lld, num_inf=%lld, num_zero=%lld) in " "%s.", diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 877bd056ac5426..f2b7de681bcfce 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -49,7 +49,7 @@ static inline void GetBlockDims(const phi::GPUContext& context, *grid_dims = dim3(grid_cols, grid_rows, 1); } -#if !defined(PADDLE_WITH_HIP) && !defined(PADDLE_WITH_MUSA) +#ifndef PADDLE_WITH_HIP #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else diff --git a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h index 2d210f32009370..e6d587a61e11a7 100644 --- a/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_cpu_kernel.h @@ -29,7 +29,7 @@ template using EigenVector = phi::EigenVector; -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group for GRU CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group for GRU CPU template void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, T *gate_value, diff --git a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h index d0f714831549bc..b491cbe120d06f 100644 --- a/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h @@ -144,7 +144,7 @@ __global__ void KeFastCollectiveGruGate(T *gate_value, } for (int i = 0; i < Tiled_size; ++i) { -#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700) || defined(__MUSACC__) +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i]; #else c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i]; @@ -206,7 +206,7 @@ __global__ void KeFastCollectiveGruOut(const T *gate_weight, } for (int i = 0; i < Tiled_size; ++i) { -#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700) || defined(__MUSACC__) +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 c0 = c0 + __shfl_sync(Tiled_mask, a0, i, Tiled_size) * b0[i]; #else c0 = c0 + __shfl(a0, i, Tiled_size) * b0[i]; diff --git a/paddle/phi/kernels/funcs/detail/gru_kernel.h b/paddle/phi/kernels/funcs/detail/gru_kernel.h index f5a16ade4fd23d..9e2aef19406191 
100644 --- a/paddle/phi/kernels/funcs/detail/gru_kernel.h +++ b/paddle/phi/kernels/funcs/detail/gru_kernel.h @@ -44,7 +44,7 @@ class gru_resetOutput { (*value_reset_output + *value_reset_bias) * (*value_reset_gate); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU reset output +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU reset output #ifndef __AVX__ static const bool avx = false; #else @@ -90,7 +90,7 @@ class gru_finalOutput { ((*value_update_gate) * (*value_frame_state)); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___)// @{ Group GRU final output +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU final output #ifndef __AVX__ static const bool avx = false; #else @@ -150,7 +150,7 @@ class gru_stateGrad { *grad_output * (*value_update_gate), *value_frame_state, act_input); } } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU state grad +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU state grad #ifndef __AVX__ static const bool avx = false; #else @@ -211,7 +211,7 @@ class gru_resetGrad { *grad_reset_gate = activation(*grad_reset_gate, *value_reset_gate, act_gate); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU reset grad +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU reset grad #ifndef __AVX__ static const bool avx = false; #else @@ -265,7 +265,7 @@ class gru { reset_output * (*grad_frame_state), *value_reset_gate, act_gate); *grad_reset_output = (*value_reset_gate) * (*grad_frame_state); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group GRU CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group GRU CPU #ifndef __AVX__ static const bool avx = false; #else diff --git a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h index b0702d560fa518..e8b8e957c80d1c 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_cpu_kernel.h @@ -36,7 +36,7 @@ template using EigenVector = phi::EigenVector; -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM CPU +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM CPU template void naive_lstm_forward_one_sequence(Op op, diff --git a/paddle/phi/kernels/funcs/detail/lstm_kernel.h b/paddle/phi/kernels/funcs/detail/lstm_kernel.h index 264322521d477f..0846f05a0c2c53 100644 --- a/paddle/phi/kernels/funcs/detail/lstm_kernel.h +++ b/paddle/phi/kernels/funcs/detail/lstm_kernel.h @@ -59,7 +59,7 @@ class lstm { *state_atv = activation(*state, active_state); *output = (*value_og) * (*state_atv); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM FWD +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM FWD #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default static const bool avx = false; #else @@ -163,7 +163,7 @@ class lstm { *checkFGrad = (*grad_fg) * (*prev_state); *checkOGrad = (*grad_og) * (*state); } -#if !defined(__NVCC__) && !defined(__HIPCC___) && !defined(__MUSACC___) // @{ Group LSTM BWD +#if !defined(__NVCC__) && !defined(__HIPCC___) // @{ Group LSTM BWD #ifndef __AVX__ // If not compiled with AVX instructs. 
Disable AVX by default static const bool avx = false; #else diff --git a/paddle/phi/kernels/funcs/detail/strided_memcpy.h b/paddle/phi/kernels/funcs/detail/strided_memcpy.h index 555b1d3fb250e0..03e3bdde05ad09 100644 --- a/paddle/phi/kernels/funcs/detail/strided_memcpy.h +++ b/paddle/phi/kernels/funcs/detail/strided_memcpy.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/device_context.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/phi/backends/gpu/gpu_context.h" #endif @@ -41,7 +41,7 @@ struct StridedMemcpyFunctor { auto& cpu_place = place; memory_utils::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy( @@ -68,7 +68,7 @@ struct StridedMemcpyFunctor { memory_utils::Copy( cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) auto& gpu_place = place; auto& cuda_ctx = reinterpret_cast(dev_ctx); memory_utils::Copy(gpu_place, diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index 6f4e5fceec4739..5504a337e88f2e 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -14,7 +14,7 @@ #pragma once -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include @@ -109,7 +109,7 @@ DenseTensor Diagonal(const DeviceContext& context, int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) thrust::device_vector diag_vec(common::vectorize(dig_stride)); const int64_t* diag_arr = thrust::raw_pointer_cast(diag_vec.data()); thrust::device_vector ret_vec(ret_strides); @@ -146,7 +146,7 @@ std::vector ComputeDimStride(const std::vector dim) { return dim_strides; } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template __global__ void DiagonalCuda(const T* data1, T* data2, diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h index f9c6a0934dc6a2..abade7ac0ef877 100644 --- a/paddle/phi/kernels/funcs/distribution_helper.h +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -21,10 +21,6 @@ limitations under the License. */ #include #endif -#ifdef __MUSACC__ -#include -#endif - #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/common/amp_type_traits.h" @@ -32,7 +28,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/generator.h" #include "paddle/phi/core/hostdevice.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/primitive/kernel_primitives.h" #endif @@ -53,7 +49,7 @@ struct exponential_transform { explicit exponential_transform(T lambda) : lambda_(lambda) {} HOSTDEVICE inline T operator()(T val) const { -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) T log = -std::numeric_limits::epsilon() / 2; if (val < static_cast(1.) - std::numeric_limits::epsilon() / 2) { if (std::is_same::value) { @@ -117,7 +113,7 @@ struct normal_transform { T std_; }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) namespace kps = phi::kps; @@ -126,19 +122,19 @@ namespace kps = phi::kps; template struct normal_distribution; -#if defined(__MUSACC__) +#if defined(__NVCC__) template struct uniform_distribution { - __device__ inline T operator()(murandStatePhilox4_32_10_t *state) const { - return static_cast(murand_uniform(state)); + __device__ inline T operator()(curandStatePhilox4_32_10_t *state) const { + return static_cast(curand_uniform(state)); } static constexpr int kReturnsCount = 1; }; template <> struct uniform_distribution { - __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { - return murand_uniform4(state); + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_uniform4(state); } static constexpr int kReturnsCount = 4; }; @@ -146,16 +142,16 @@ struct uniform_distribution { template <> struct uniform_distribution { __device__ inline double2 operator()( - murandStatePhilox4_32_10_t *state) const { - return murand_uniform2_double(state); + curandStatePhilox4_32_10_t *state) const { + return curand_uniform2_double(state); } static constexpr int kReturnsCount = 2; }; template <> struct uniform_distribution { - __device__ inline uint4 operator()(murandStatePhilox4_32_10_t *state) const { - return murand4(state); + __device__ inline uint4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand4(state); } static constexpr int kReturnsCount = 4; }; @@ -163,9 +159,9 @@ struct uniform_distribution { template <> struct uniform_distribution { __device__ inline ulonglong2 operator()( - murandStatePhilox4_32_10_t *state) const { + curandStatePhilox4_32_10_t *state) const { ulonglong2 result; - uint4 rand = murand4(state); + uint4 rand = curand4(state); result.x = (uint64_t)rand.x << 32 | rand.y; result.y = (uint64_t)rand.z << 32 | rand.w; return result; @@ -175,8 +171,8 @@ struct uniform_distribution { template <> struct normal_distribution { - __device__ inline float4 operator()(murandStatePhilox4_32_10_t *state) const { - return murand_normal4(state); + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_normal4(state); } static constexpr int kReturnsCount = 4; }; @@ -184,8 +180,8 @@ struct normal_distribution { template <> struct normal_distribution { __device__ inline double2 operator()( - murandStatePhilox4_32_10_t *state) const { - return murand_normal2_double(state); + curandStatePhilox4_32_10_t *state) const { + return curand_normal2_double(state); } static constexpr int kReturnsCount = 2; }; @@ -268,10 +264,10 @@ __global__ void DistributionKernel(size_t size, size_t stride) { size_t idx = 
static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = DistOp::kReturnsCount; -#if defined(__MUSACC__) - murandStatePhilox4_32_10_t state; - murand_init(seed, idx + THREAD_ID_X, offset, &state); - using SType = murandStatePhilox4_32_10_t; +#if defined(__NVCC__) + curandStatePhilox4_32_10_t state; + curand_init(seed, idx + THREAD_ID_X, offset, &state); + using SType = curandStatePhilox4_32_10_t; #else hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, offset, &state); diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h index 87283549f8e294..985c028afb2a88 100644 --- a/paddle/phi/kernels/funcs/dropout_impl.cu.h +++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h @@ -20,12 +20,6 @@ limitations under the License. */ #include #include #endif - -#ifdef PADDLE_WITH_MUSA -#include -#include -#endif - #ifdef PADDLE_WITH_HIP #include #include @@ -152,10 +146,6 @@ __global__ void VectorizedRandomGenerator( hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; -#elif defined(PADDLE_WITH_MUSA) - murandStatePhilox4_32_10_t state; - murand_init(seed, idx + THREAD_ID_X, increment, &state); - using SType = murandStatePhilox4_32_10_t; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -226,10 +216,6 @@ __global__ void VectorizedGeneratorMask(const size_t n, hiprandStatePhilox4_32_10_t state; hiprand_init(seed, idx + THREAD_ID_X, increment, &state); using SType = hiprandStatePhilox4_32_10_t; -#elif defined(PADDLE_WITH_MUSA) - murandStatePhilox4_32_10_t state; - murand_init(seed, idx + THREAD_ID_X, increment, &state); - using SType = murandStatePhilox4_32_10_t; #else curandStatePhilox4_32_10_t state; curand_init(seed, idx + THREAD_ID_X, increment, &state); @@ -302,11 +288,6 @@ void DropoutFwGPUKernelDriver( hipMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); PADDLE_ENFORCE_GPU_SUCCESS( hipMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); -#elif defined(PADDLE_WITH_MUSA) - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE_GPU_SUCCESS( - musaMemsetAsync(mask_data, 0, x_numel * sizeof(*mask_data), stream)); #else PADDLE_ENFORCE_GPU_SUCCESS( cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); @@ -368,7 +349,7 @@ void DropoutFwGPUKernelDriver( } else { bool copy_in_kernel = GetSeedDataAndIncrement( dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment); -#if defined(PADDLE_WITH_HIP) || defined(PADDLE_WITH_MUSA) +#ifdef PADDLE_WITH_HIP VectorizedRandomGenerator <<>>(0, size, @@ -468,8 +449,6 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, if (upscale_in_train && dropout_prob == 1.0f) { #ifdef PADDLE_WITH_HIP hipMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); -#elif defined(PADDLE_WITH_MUSA) - musaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #else cudaMemset(grad_x->data(), 0, grad_x->numel() * sizeof(T)); #endif diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 5b2657704367e5..c92acdaf4180be 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/elementwise_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/function_traits.h" @@ -150,7 +150,7 @@ class MidWiseTransformIterator int64_t post_; }; -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) +#if defined(__NVCC__) || defined(__HIPCC__) template class RowwiseTransformIterator : public thrust::iterator_adaptor, @@ -485,7 +485,7 @@ inline void ElementwiseGradPreProcess(const DenseTensor &dout, } } -#if defined(__NVCC__) || defined(__HIPCC__) || defined(__MUSACC__) || defined(__xpu__) +#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) // static unroller template
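Many of the hunks above make the same one-line change: dropping defined(__MUSACC__) (or PADDLE_WITH_MUSA) from the __NVCC__ / __HIPCC__ / __xpu__ compiler guard in otherwise unrelated headers. As a hedged aside on the design choice, that kind of churn could be reduced by centralizing the guard behind a single macro so a backend is added or removed in exactly one place. The sketch below is hypothetical: the header path and the PHI_GPU_KERNEL_COMPILER name are invented for illustration and are not part of Paddle.

// Hypothetical consolidation sketch -- not part of Paddle.
// e.g. a single header such as paddle/phi/core/gpu_lang_guard.h
#pragma once

// True when compiling with any supported GPU/accelerator device compiler;
// extend this one expression per new backend instead of editing every header.
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
#define PHI_GPU_KERNEL_COMPILER 1
#else
#define PHI_GPU_KERNEL_COMPILER 0
#endif

// Usage in a kernel header, replacing the repeated defined(...) chain:
//   #if PHI_GPU_KERNEL_COMPILER
//   #include "paddle/phi/kernels/funcs/aligned_vector.h"
//   ...
//   #endif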