{
  "model_name_or_path": "/root/paddlejob/tmpspace/huggingface_model/huggingface/deepseek-ai/DeepSeek-V3-bf16/",
  "dataset_name_or_path": "./data_small",
  "output_dir": "./checkpoints/sft_ckpts",
  "per_device_train_batch_size": 1,
  "gradient_accumulation_steps": 8,
  "per_device_eval_batch_size": 1,
  "eval_accumulation_steps": 1,
  "max_steps": 100,
  "max_grad_norm": 1.0,
  "amp_master_grad": true,
  "num_train_epochs": 1,
  "learning_rate": 2.2e-05,
  "aux_loss_alpha": 0.0001,
  "warmup_steps": 30,
  "logging_steps": 1,
  "evaluation_strategy": "no",
  "save_strategy": "no",
  "src_length": 2048,
  "max_length": 4097,
  "bf16": true,
  "fp16_opt_level": "O2",
  "do_train": true,
  "do_eval": false,
  "disable_tqdm": true,
  "use_expert_parallel": true,
  "expert_parallel_degree": 8,
  "continue_training": false,
  "pipeline_parallel_config": "enable_delay_scale_loss disable_partial_send_recv disable_batch_p2p_comm",
  "tensor_parallel_config": "sync_param sync_grad",
  "sharding_parallel_config": "split_param",
  "load_best_model_at_end": true,
  "eval_with_do_generation": false,
  "metric_for_best_model": "loss",
  "recompute": true,
  "recompute_use_reentrant": true,
  "recompute_granularity": "full",
  "save_total_limit": 1,
  "tensor_parallel_degree": 4,
  "pipeline_parallel_degree": 8,
  "sharding_parallel_degree": 2,
  "sharding": "stage1",
  "zero_padding": true,
  "unified_checkpoint": true,
  "use_flash_attention": true,
  "flash_mask": true,
  "using_fake_gate": true,
  "using_flex_token": true,
  "use_fused_rms_norm": true,
  "moe_subbatch_token_num": 0,
  "recompute_offload": true,
  "pre_alloc_memory": 70,
  "tensorwise_offload_optimizer": true,
  "sequence_parallel": true,
  "tensor_parallel_output": true
}