
Commit ee4968b

add dsv3 64gpu sft json && solve OOM problem by offload (#11112)
1 parent 7adc457 commit ee4968b

9 files changed: +264 -0 lines changed

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
{
    "model_name_or_path": "/root/paddlejob/tmpspace/huggingface_model/huggingface/deepseek-ai/DeepSeek-V3-bf16/",
    "dataset_name_or_path": "./data_small",
    "output_dir": "./checkpoints/sft_ckpts",
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "per_device_eval_batch_size": 1,
    "eval_accumulation_steps": 1,
    "max_steps": 100,
    "max_grad_norm": 1.0,
    "amp_master_grad": true,
    "num_train_epochs": 1,
    "learning_rate": 2.2e-05,
    "aux_loss_alpha": 0.0001,
    "warmup_steps": 30,
    "logging_steps": 1,
    "evaluation_strategy": "no",
    "save_strategy": "no",
    "src_length": 2048,
    "max_length": 4097,
    "bf16": true,
    "fp16_opt_level": "O2",
    "do_train": true,
    "do_eval": false,
    "disable_tqdm": true,
    "use_expert_parallel": true,
    "expert_parallel_degree": 8,
    "continue_training": false,
    "pipeline_parallel_config": "enable_delay_scale_loss disable_partial_send_recv disable_batch_p2p_comm",
    "tensor_parallel_config": "sync_param sync_grad",
    "sharding_parallel_config": "split_param",
    "load_best_model_at_end": true,
    "eval_with_do_generation": false,
    "metric_for_best_model": "loss",
    "recompute": true,
    "recompute_use_reentrant": true,
    "recompute_granularity": "full",
    "save_total_limit": 1,
    "tensor_parallel_degree": 4,
    "pipeline_parallel_degree": 8,
    "sharding_parallel_degree": 2,
    "sharding": "stage1",
    "zero_padding": true,
    "unified_checkpoint": true,
    "use_flash_attention": true,
    "flash_mask": true,
    "using_fake_gate": true,
    "using_flex_token": true,
    "use_fused_rms_norm": true,
    "moe_subbatch_token_num": 0,
    "recompute_offload": true,
    "pre_alloc_memory": 70,
    "tensorwise_offload_optimizer": true,
    "sequence_parallel": true,
    "tensor_parallel_output": true
}

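Taken together, the parallel degrees in this config describe the 64-GPU layout named in the commit title: tensor_parallel_degree 4 × pipeline_parallel_degree 8 × sharding_parallel_degree 2 = 64 ranks, with expert_parallel_degree 8 applied to the MoE layers. The memory-related switches tied to the OOM fix show up here as recompute_offload (new in this commit) alongside tensorwise_offload_optimizer and pre_alloc_memory. A minimal sanity-check sketch of the arithmetic (illustrative only, not part of the commit):

    # Illustrative sketch, not part of the commit: the dense parallel degrees
    # in the JSON above must multiply to the number of GPUs in the job.
    degrees = {"tensor_parallel_degree": 4, "pipeline_parallel_degree": 8, "sharding_parallel_degree": 2}
    world_size = 1
    for _, degree in degrees.items():
        world_size *= degree
    assert world_size == 64  # the "64gpu" in the commit title
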
llm/run_finetune.py

Lines changed: 1 addition & 0 deletions
@@ -268,6 +268,7 @@ def main():
     model_config.moe_subbatch_token_num = model_args.moe_subbatch_token_num
     model_config.aux_loss_alpha = model_args.aux_loss_alpha
     model_config.gradient_accumulation_steps = training_args.gradient_accumulation_steps
+    model_config.recompute_offload = training_args.recompute_offload
     logger.info(f"Final model config: {model_config}")
 
     logger.info("Creating model")

llm/script/kill_process.sh

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
#!/bin/bash
set -x

SCRIPT_DIR=`dirname "$0"`
LAUNCH_SCRIPT="$SCRIPT_DIR/selective_launch.py"

if [[ -f "$LAUNCH_SCRIPT" ]]; then
    LAUNCH_CMD=`python "$LAUNCH_SCRIPT" 36677`
    if [[ -z "$LAUNCH_CMD" ]]; then
        exit 0
    fi
fi

skip_kill_time=${1:-"False"}

function kill_impl() {
    skip_kill_time=$1
    if [[ $skip_kill_time == "True" ]];then
        for((i=1;i<=60;i++));
        do
            pids=`ps -ef | grep 'time_2023_8888.py' | grep -v grep | awk '{print $2}'`
            if [[ "$pids" == "" ]] ; then
                echo "no process found for speed-testing. stop waiting and kill other scripts."
                break
            fi
            echo "wait 10 seconds for finishing the speed-testing scripts."
            sleep 10s
        done
    fi

    # kill aadiff test finally.
    ps -ef | grep -E "check_aadiff.sh|run_aadiff_matmul.sh|test_matmul.py" | awk '{print $2}' | xargs kill -9

    pids=`ps -ef | grep train.py | grep -v grep | awk '{print $2}'`
    if [[ "$pids" != "" ]] ; then
        echo $pids
        echo $pids | xargs kill -9
    fi

    # kill agent server
    (ps -ef | grep agent | grep port | awk '{print $2}' | xargs -I {} kill -9 {}) || true

    if [[ $TRAININGJOB_REPLICA_NAME == "trainer" ]]; then
        echo "Killing processes on gpu"
        lsof /dev/nvidia* | awk '{print $2}' | xargs -I {} kill -9 {}
    elif [[ $TRAININGJOB_REPLICA_NAME == "trainerxpu" ]]; then
        echo "Killing processes on xpu"
        lsof /dev/xpu* | awk '{print $2}' | xargs -I {} kill -9 {}
    else
        echo "[FATAL] unsupported training job type: ${TRAININGJOB_REPLICA_NAME}"
        exit 1
    fi
}

kill_impl $skip_kill_time || true

llm/script/selective_launch.py

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
"""
Selective launch script.

Usage: python script/selective_launch.py <port> <ranks> <ranks> <ranks> ...
"""
import os
import sys


def parse_ranks(ranks_strs):
    """
    parse_ranks
    """
    # NOTE: You can return ranks directly here to change script/train_gpu.sh
    # and script/kill_process.sh together

    # Example 1: Use contiguous nodes [8, 16)
    # return range(8, 16)

    # Example 2: Use non-contiguous nodes [4, 8) + {10} + [30, 32), i.e., [4, 5, 6, 7, 10, 30, 31]
    # return list(range(4, 8)) + [10] + list(range(30, 32))

    # Example 3:
    # Just Python code, return any nodes you want!
    return list(range(64, 72))
    if not ranks_strs:
        return None

    ranks = []
    for r in ranks_strs:
        r = eval(r)
        if isinstance(r, int):
            ranks.append(r)
        else:
            ranks.extend(r)
    return ranks


def main(port, ranks):
    """
    main
    """
    ips = [ip.strip() for ip in os.getenv("TRAINER_INSTANCES").split(",") if ip.strip()]
    if ranks is None:
        ranks = list(range(len(ips)))
    ranks = sorted(list(set(ranks)))
    my_rank = int(os.getenv("POD_INDEX", "0"))
    if my_rank not in ranks:
        return

    rank = ranks.index(my_rank)
    nranks = len(ranks)

    master = ips[ranks[0]]
    print(f"--master {master}:{port} --rank {rank} --nnodes {nranks}")


if __name__ == "__main__":
    main(int(sys.argv[1]), parse_ranks(sys.argv[2:]))

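As committed, parse_ranks returns list(range(64, 72)) before the argument-parsing branch is reached, so only pods 64-71 of the job take part: each of them prints a --master/--rank/--nnodes fragment for paddle.distributed.launch, and every other pod prints nothing, which is what makes train_gpu.sh and kill_process.sh exit early. A framework-free sketch of that selection (illustrative pod metadata, not part of the commit):

    # Illustrative only: reproduce the selection logic with fake pod metadata.
    import os

    os.environ["TRAINER_INSTANCES"] = ",".join(f"10.0.0.{i}" for i in range(72))
    os.environ["POD_INDEX"] = "65"

    ips = os.environ["TRAINER_INSTANCES"].split(",")
    ranks = list(range(64, 72))                # the hardcoded selection above
    my_rank = int(os.environ["POD_INDEX"])
    if my_rank in ranks:
        # pod 64 is the master; pod 65 becomes rank 1 of the 8 participating nodes
        print(f"--master {ips[ranks[0]]}:36677 --rank {ranks.index(my_rank)} --nnodes {len(ranks)}")
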
llm/script/train_gpu.sh

Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
#!/bin/bash

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

unset PADDLE_ELASTIC_JOB_ID
unset PADDLE_TRAINER_ENDPOINTS
unset DISTRIBUTED_TRAINER_ENDPOINTS
unset FLAGS_START_PORT
unset PADDLE_ELASTIC_TIMEOUT

nnodes=$PADDLE_TRAINERS_NUM
rank=$PADDLE_TRAINER_ID

for name in `env | grep -E 'PADDLE|ENDPOINT' | awk -F'=' '{print $1}'`; do
    unset ${name}
done

#export FLAGS_shard_bypass_dygraph_optimizer=1
export NCCL_IB_GID_INDEX=3
export NVSHMEM_IB_GID_INDEX=3
export NVSHMEM_IB_TRAFFIC_CLASS=162

#export NVSHMEM_IB_ENABLE_IBGDA=true
##export NVSHMEM_DISABLE_P2P=1
export NVSHMEM_BOOTSTRAP=UID

unset NVSHMEM_HCA_LIST
unset NVSHMEM_ENABLE_NIC_PE_MAPPING

LAUNCH_CMD=`python script/selective_launch.py 36677`
if [[ -z "$LAUNCH_CMD" ]]; then
    exit 0
fi

export PYTHONPATH=../:$PYTHONPATH
export CUDA_PATH=/usr/local/cuda-12.9

export DSV3_USE_FP8_GEMM=true
export DSV3_USE_ATTEN_RECOMPUTE=true
export FA_VERSION=3
export FLAGS_share_tensor_for_grad_tensor_holder=1
export FLAGS_use_default_stream=false
export DSV3_USE_FP8_DISPATCH=true
export USE_DS_GEMM=false

export NVJITLIB=/root/paddlejob/workspace/env_run/zhengzhonghui/venv/lib/python3.10/site-packages/nvidia/nvjitlink/lib/
export CUSPARSELIB=/root/paddlejob/workspace/env_run/zhengzhonghui/venv/lib/python3.10/site-packages/nvidia/cusparse/lib
export LD_LIBRARY_PATH=$NVJITLIB:$CUSPARSELIB:$LD_LIBRARY_PATH

source /root/paddlejob/workspace/env_run/zhengzhonghui/venv/bin/activate
# source /root/paddlejob/workspace/env_run/chenzhichao/venv/bin/activate
bash script/kill_process.sh

rm core.* -rf

python3.10 -m paddle.distributed.launch \
    --log_dir output/paddle_distributed_logs \
    $LAUNCH_CMD \
    --run_mode=collective \
    ${script:-run_finetune.py} \
    $@

paddlenlp/trainer/utils/offload_optimizer.py

Lines changed: 8 additions & 0 deletions
@@ -48,6 +48,14 @@ def new_add_accumulator(self, *args, **kwargs):
 
     setattr(Optimizer, "_add_accumulator", new_add_accumulator)
 
+    origin_create_master_weight = getattr(Optimizer, "_create_master_weight")
+    def new_create_master_weight(self, *args, **kwargs):
+        x = origin_create_master_weight(self, *args, **kwargs)
+        offload(x)
+        return x
+
+    setattr(Optimizer, "_create_master_weight", new_create_master_weight)
+
     # Step 2: mock _C_ops.adamw_ and _C_ops.adamw
     for name in ["adam_", "adamw_"]:
         origin_op = getattr(_C_ops, name)

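This extends the existing monkey-patch on Paddle's Optimizer: every tensor returned by _create_master_weight is handed to offload() right after creation, the same treatment the file already applies to optimizer accumulators, so the master weight copies no longer sit in GPU memory by default. A generic sketch of the wrap-and-offload pattern (illustrative; offload_fn stands in for the helper defined earlier in this file):

    # Illustrative sketch of the wrap-and-offload pattern, not the real Paddle API.
    def wrap_with_offload(cls, method_name, offload_fn):
        original = getattr(cls, method_name)

        def wrapped(self, *args, **kwargs):
            tensor = original(self, *args, **kwargs)
            offload_fn(tensor)  # park the freshly created tensor off the GPU
            return tensor

        setattr(cls, method_name, wrapped)
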
paddlenlp/transformers/deepseek_v2/modeling_pp.py

Lines changed: 8 additions & 0 deletions
@@ -235,6 +235,9 @@ def forward(self, args):
                 attn_mask_startend_row_indices=attn_mask_startend_row_indices,
             )
         elif self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
+            offload_kwargs = {}
+            if self.config.recompute_offload:
+                offload_kwargs["offload_indices"] = [0]
             if attention_mask is not None or attn_mask_startend_row_indices is not None:
                 hidden_states = recompute(
                     super().forward,
@@ -243,6 +246,7 @@ def forward(self, args):
                     attention_mask=attention_mask,
                     attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                     use_reentrant=self.config.recompute_use_reentrant,
+                    **offload_kwargs,
                 )
             else:
                 # for pretrain
@@ -300,6 +304,9 @@ def forward(self, args):
                 attn_mask_startend_row_indices=attn_mask_startend_row_indices,
             )
         elif self.enable_recompute and self.config.recompute_granularity == "full" and has_gradient:
+            offload_kwargs = {}
+            if self.config.recompute_offload:
+                offload_kwargs["offload_indices"] = [0]
             if attention_mask is not None or attn_mask_startend_row_indices is not None:
                 hidden_states = recompute(
                     super().forward,
@@ -309,6 +316,7 @@ def forward(self, args):
                     attention_mask=attention_mask,
                     attn_mask_startend_row_indices=attn_mask_startend_row_indices,
                     use_reentrant=self.config.recompute_use_reentrant,
+                    **offload_kwargs,
                 )
             else:
                 # for pretrain

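When full-granularity recompute is active and config.recompute_offload is set, the layers now pass offload_indices=[0] to recompute, which by its name selects the 0-th positional input (the hidden states saved for backward recomputation) for offloading to host memory; with the flag unset, offload_kwargs stays empty and the call is exactly what it was before. A minimal, framework-free sketch of that conditional-kwargs pattern (stand-in functions, not the real recompute API):

    # Illustrative only: build the extra kwargs when the flag is on, then splat
    # them into the call so the default path is untouched otherwise.
    def fake_recompute(fn, *args, offload_indices=None, **kwargs):
        print("offload_indices =", offload_indices)  # stand-in for the real wrapper
        return fn(*args, **kwargs)

    def layer_forward(hidden_states):
        return hidden_states * 2

    recompute_offload = True
    offload_kwargs = {"offload_indices": [0]} if recompute_offload else {}
    out = fake_recompute(layer_forward, 21, **offload_kwargs)  # prints "offload_indices = [0]"
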
paddlenlp/transformers/deepseek_v3/modeling.py

Lines changed: 2 additions & 0 deletions
@@ -88,6 +88,7 @@ def forward(
         self,
         input_ids: paddle.Tensor = None,
         attention_mask: Optional[paddle.Tensor] = None,
+        attn_mask_startend_row_indices: Optional[paddle.Tensor] = None,
         position_ids: Optional[paddle.Tensor] = None,
         past_key_values: Optional[List[paddle.Tensor]] = None,
         inputs_embeds: Optional[paddle.Tensor] = None,
@@ -139,6 +140,7 @@ def forward(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
+            attn_mask_startend_row_indices=attn_mask_startend_row_indices,
         )
 
         hidden_states = outputs[0]

paddlenlp/trl/sft_config.py

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ class SFTConfig(TrainingArguments):
     model_init_kwargs: Optional[dict[str, Any]] = None
     dataset_kwargs: Optional[dict[str, Any]] = None
     eval_packing: Optional[bool] = None
+    recompute_offload: Optional[bool] = None
     use_ssa: bool = field(
         default=False,
         metadata={

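The new SFTConfig field closes the loop for the flag used throughout this commit: the SFT JSON sets recompute_offload, run_finetune.py copies it from the training arguments onto the model config, and the pipeline layers in modeling_pp.py read it when deciding whether to pass offload_indices to recompute. It defaults to None, so configs that never mention it should keep the old code path. A simplified sketch of that hand-off (toy stand-ins, not the real PaddleNLP classes):

    # Illustrative stand-ins showing how the flag travels from JSON to model config.
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class SFTArgs:  # stands in for SFTConfig
        recompute_offload: Optional[bool] = None

    @dataclass
    class ModelCfg:  # stands in for the model config
        recompute_offload: bool = False

    training_args = SFTArgs(recompute_offload=True)  # value parsed from the SFT JSON above
    model_config = ModelCfg()
    model_config.recompute_offload = training_args.recompute_offload  # the run_finetune.py step
    assert model_config.recompute_offload is True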