From cfa48d9e7160f762bad212b479093638e5f2c594 Mon Sep 17 00:00:00 2001
From: GhostScreaming
Date: Wed, 28 Sep 2022 08:18:01 +0000
Subject: [PATCH 1/5] Add introduction for sequence_parallel in README.

---
 projects/gpt/docs/README.md          | 2 ++
 projects/gpt/docs/hybrid_parallel.md | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/projects/gpt/docs/README.md b/projects/gpt/docs/README.md
index e1afc13c1..7d5108251 100644
--- a/projects/gpt/docs/README.md
+++ b/projects/gpt/docs/README.md
@@ -171,6 +171,7 @@ The Engine training settings cover the parameters used in model training/evaluation/inference,
     no_recompute_layers:
     fused_linear: True
     fuse_attn_qkv: True
+    sequence_parallel: False
 ```

 The parameters are described as follows:
@@ -192,6 +193,7 @@ The Engine training settings cover the parameters used in model training/evaluation/inference,
 |no_recompute_layers| list of integer, marking which transformer layers do not need recompute. Every value in the list should be >= 0 and < num_layers. Adding more non-recomputed layers to this list improves the overall training throughput but moderately increases GPU memory usage. If spare GPU memory is observed during training, the number of non-recomputed layers can be increased appropriately; if OOM errors appear after enabling this option, reduce it appropriately. |
 | fused_linear | Whether to use fused_linear instead of the regular Linear to speed up training. Note: this feature requires Paddle compiled with CUDA 11.6 or above. |
 | fuse_attn_qkv | Whether to apply the fuse strategy to the qkv computation in the attention layers to speed up training |
+| sequence_parallel | Whether to use the sequence_parallel strategy to speed up training. Note: only the hybrid-parallel GPT supports this feature; it shares its communication group with model parallelism, and when mp=1 the sequence parallel strategy is forcibly disabled. |
 | virtual_pp_degree | Virtual pipeline parallel degree. This parameter reduces the proportion of pipeline bubbles and thus improves pipeline throughput, but it also increases inter-stage communication, so the recommended value is 2. Moreover, virtual pipeline parallelism can only be used when num_layers is divisible by pp_degree * virtual_pp_degree. |

 ### Dataset

diff --git a/projects/gpt/docs/hybrid_parallel.md b/projects/gpt/docs/hybrid_parallel.md
index e79d36207..bc2db1cca 100644
--- a/projects/gpt/docs/hybrid_parallel.md
+++ b/projects/gpt/docs/hybrid_parallel.md
@@ -53,7 +53,7 @@

 ### Supported strategies

-PaddlePaddle's hybrid parallel technology covers 4 dimensions: data parallelism, tensor model parallelism, pipeline parallelism and grouped sharding parallelism. In addition, it supports strategies such as recompute, offload and mixed precision to reduce GPU memory usage and speed up training.
+PaddlePaddle's hybrid parallel technology covers 4 dimensions: data parallelism, tensor model parallelism, pipeline parallelism and grouped sharding parallelism. In addition, it supports strategies such as recompute, offload, mixed precision and sequence parallelism (which relies on tensor model parallelism) to reduce GPU memory usage and speed up training.

 Currently, GPT training already supports any combination of strategies across the first 3 dimensions, while grouped sharding stage2/3 can only be combined with data parallelism; see the table below.

From 3a153871dab389cbd833ccdbc08474c2b9f091a4 Mon Sep 17 00:00:00 2001
From: GhostScreaming
Date: Wed, 28 Sep 2022 08:35:13 +0000
Subject: [PATCH 2/5] Add introduction for sequence_parallel in README.

---
 projects/gpt/docs/README.md          | 2 +-
 projects/gpt/docs/hybrid_parallel.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/gpt/docs/README.md b/projects/gpt/docs/README.md
index 7d5108251..a472feb6d 100644
--- a/projects/gpt/docs/README.md
+++ b/projects/gpt/docs/README.md
@@ -193,7 +193,7 @@ The Engine training settings cover the parameters used in model training/evaluation/inference,
 |no_recompute_layers| list of integer, marking which transformer layers do not need recompute. Every value in the list should be >= 0 and < num_layers. Adding more non-recomputed layers to this list improves the overall training throughput but moderately increases GPU memory usage. If spare GPU memory is observed during training, the number of non-recomputed layers can be increased appropriately; if OOM errors appear after enabling this option, reduce it appropriately. |
 | fused_linear | Whether to use fused_linear instead of the regular Linear to speed up training. Note: this feature requires Paddle compiled with CUDA 11.6 or above. |
 | fuse_attn_qkv | Whether to apply the fuse strategy to the qkv computation in the attention layers to speed up training |
-| sequence_parallel | Whether to use the sequence_parallel strategy to speed up training. Note: only the hybrid-parallel GPT supports this feature; it shares its communication group with model parallelism, and when mp=1 the sequence parallel strategy is forcibly disabled. |
+| sequence_parallel | Whether to use the sequence parallel strategy to speed up training. Note: only the hybrid-parallel GPT supports this feature; it shares its communication group with tensor model parallelism, and when mp_degree=1 the sequence parallel strategy is forcibly disabled. |
 | virtual_pp_degree | Virtual pipeline parallel degree. This parameter reduces the proportion of pipeline bubbles and thus improves pipeline throughput, but it also increases inter-stage communication, so the recommended value is 2. Moreover, virtual pipeline parallelism can only be used when num_layers is divisible by pp_degree * virtual_pp_degree. |

 ### Dataset

diff --git a/projects/gpt/docs/hybrid_parallel.md b/projects/gpt/docs/hybrid_parallel.md
index bc2db1cca..b6fa4a5db 100644
--- a/projects/gpt/docs/hybrid_parallel.md
+++ b/projects/gpt/docs/hybrid_parallel.md
@@ -53,7 +53,7 @@

 ### Supported strategies

-PaddlePaddle's hybrid parallel technology covers 4 dimensions: data parallelism, tensor model parallelism, pipeline parallelism and grouped sharding parallelism. In addition, it supports strategies such as recompute, offload, mixed precision and sequence parallelism (which relies on tensor model parallelism) to reduce GPU memory usage and speed up training.
+PaddlePaddle's hybrid parallel technology covers 4 dimensions: data parallelism, tensor model parallelism, pipeline parallelism and grouped sharding parallelism. In addition, it supports strategies such as recompute, offload, mixed precision and sequence parallelism to reduce GPU memory usage and speed up training.

 Currently, GPT training already supports any combination of strategies across the first 3 dimensions, while grouped sharding stage2/3 can only be combined with data parallelism; see the table below.

From 088c44dec9a742b6b4c87b20a5333fc14c6bd302 Mon Sep 17 00:00:00 2001
From: GhostScreaming
Date: Tue, 25 Oct 2022 04:06:12 +0000
Subject: [PATCH 3/5] Set sequence_parallel to True by default.

---
 projects/gpt/docs/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/gpt/docs/README.md b/projects/gpt/docs/README.md
index a472feb6d..db841aa21 100644
--- a/projects/gpt/docs/README.md
+++ b/projects/gpt/docs/README.md
@@ -171,7 +171,7 @@ The Engine training settings cover the parameters used in model training/evaluation/inference,
     no_recompute_layers:
     fused_linear: True
     fuse_attn_qkv: True
-    sequence_parallel: False
+    sequence_parallel: True
 ```

 The parameters are described as follows:

From 4f797c17ad5c7bb8befa31c5d2fbf0080895414e Mon Sep 17 00:00:00 2001
From: GhostScreaming
Date: Fri, 28 Oct 2022 12:30:37 +0000
Subject: [PATCH 4/5] Set sequence_parallel to False by default.

---
 ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml | 2 +-
 projects/gpt/docs/README.md                     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
index 82f5e2a49..e9c6542f7 100644
--- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
+++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml
@@ -32,7 +32,7 @@ Model:
     name: "GPT"
     fused_linear: False
     fuse_attn_qkv: True
-    sequence_parallel: True
+    sequence_parallel: False


 Data:

diff --git a/projects/gpt/docs/README.md b/projects/gpt/docs/README.md
index db841aa21..a472feb6d 100644
--- a/projects/gpt/docs/README.md
+++ b/projects/gpt/docs/README.md
@@ -171,7 +171,7 @@ The Engine training settings cover the parameters used in model training/evaluation/inference,
     no_recompute_layers:
     fused_linear: True
     fuse_attn_qkv: True
-    sequence_parallel: True
+    sequence_parallel: False
 ```

 The parameters are described as follows:

From a350d524d6558d2f88fdcb24e9137f227e62d380 Mon Sep 17 00:00:00 2001
From: GhostScreaming
Date: Mon, 13 Mar 2023 11:57:42 +0000
Subject: [PATCH 5/5] Support sequence_parallel with main_grad; errors may
 occur when sequence_parallel, main_grad and gradient_merge are used together.

---
 .../gpt/dygraph/hybrid_model.py            |  2 +-
 .../gpt/dygraph/sequence_parallel_utils.py | 23 +++++++++++--------
 ppfleetx/utils/config.py                   |  6 +++++
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
index eef18fc8d..ec68bfe32 100644
--- a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
+++ b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py
@@ -771,7 +771,7 @@ def __init__(self,
         hcg = env.get_hcg()
         mp_size = hcg.get_model_parallel_world_size()
-        if use_flash_attn or mp_size <= 1:
+        if mp_size <= 1:
             sequence_parallel = False
             logging.warning(
                 "If mp_size <= 1, sequence_parallel strategy will be turned off in GPTModelHybrid model."

diff --git a/ppfleetx/models/language_model/gpt/dygraph/sequence_parallel_utils.py b/ppfleetx/models/language_model/gpt/dygraph/sequence_parallel_utils.py
index 30d78bffb..e4de80a12 100644
--- a/ppfleetx/models/language_model/gpt/dygraph/sequence_parallel_utils.py
+++ b/ppfleetx/models/language_model/gpt/dygraph/sequence_parallel_utils.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import numpy as np
 import paddle
 from paddle import framework
 from paddle import distributed as dist
@@ -169,18 +170,19 @@ def __impl__(grad):
     return __impl__


-def create_non_fused_allreduce_gradient_hook(accumulation_steps):
+def create_non_fused_allreduce_gradient_hook(param, accumulation_steps):
     hcg = env.get_hcg()
     pg = hcg.get_model_parallel_group().process_group
     step = [0]

-    def __impl__(grad):
+    @paddle.autograd.no_grad()
+    def __impl__():
         step[0] += 1
-        if step[0] == accumulation_steps:
-            step[0] = 0
-            pg.allreduce(grad).wait()
-        return grad
+        if (step[0] % accumulation_steps) == 0:
+            if hasattr(param, "main_grad"):
+                pg.allreduce(param.main_grad).wait()
+            else:
+                pg.allreduce(param.grad).wait()

     return __impl__


@@ -202,11 +204,12 @@ def register_sequence_parallel_allreduce_hooks(
     if fuse_sequence_parallel_allreduce:
         hook = create_fused_allreduce_gradient_hook(params, accumulation_steps)
         for p in params:
-            p.register_hook(hook)
+            p._register_backward_hook(hook)
     else:
         for p in params:
-            p.register_hook(
-                create_non_fused_allreduce_gradient_hook(accumulation_steps))
+            hook = create_non_fused_allreduce_gradient_hook(p,
+                                                            accumulation_steps)
+            p._register_backward_hook(hook)


 def is_fused_matmul_bias_supported():

diff --git a/ppfleetx/utils/config.py b/ppfleetx/utils/config.py
index c51529633..2f3188558 100644
--- a/ppfleetx/utils/config.py
+++ b/ppfleetx/utils/config.py
@@ -94,6 +94,12 @@ def process_dist_config(configs):
         if 'fuse_sequence_parallel_allreduce' not in config:
             config['fuse_sequence_parallel_allreduce'] = False

+        if 'use_main_grad' in config and config['use_main_grad'] is True:
+            logger.warning(
+                "If use_main_grad is True, fuse_sequence_parallel_allreduce will be forced to False"
+            )
+            config['fuse_sequence_parallel_allreduce'] = False
+

 def process_global_configs(config):
     """
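
The non-fused hook added in PATCH 5 is the heart of the change: each parameter gets a backward hook that all-reduces its gradient over the model-parallel group once per gradient-accumulation window, preferring the FP32 `main_grad` buffer when the optimizer keeps one. The sketch below is a minimal, framework-free rendering of that logic; `FakeProcessGroup` and `Param` are stand-ins for Paddle's communication group and parameters, not ppfleetx APIs.

```python
# Minimal, framework-free sketch of the non-fused allreduce hook from PATCH 5.
# FakeProcessGroup and Param are stand-ins for Paddle's model-parallel process
# group and its parameters; only the step counting and the main_grad/grad
# selection mirror the patch.


class FakeProcessGroup:
    """Pretends to be a collective communication group."""

    def allreduce(self, tensor):
        print("allreduce called on", tensor)

        class _Task:
            def wait(self):
                pass  # a real task blocks until the collective finishes

        return _Task()


class Param:
    """A parameter that may or may not carry an FP32 main_grad buffer."""

    def __init__(self, grad, main_grad=None):
        self.grad = grad
        if main_grad is not None:
            self.main_grad = main_grad  # present only when use_main_grad is on


def create_non_fused_allreduce_gradient_hook(param, pg, accumulation_steps):
    step = [0]  # mutable counter shared by every call of the closure

    def hook():
        step[0] += 1
        # Synchronize only after the last micro-batch of an accumulation window.
        if step[0] % accumulation_steps == 0:
            # Prefer the FP32 main_grad kept by use_main_grad, else the raw grad.
            grad = param.main_grad if hasattr(param, "main_grad") else param.grad
            pg.allreduce(grad).wait()

    return hook


if __name__ == "__main__":
    p = Param(grad=[0.1, 0.2], main_grad=[0.1, 0.2])
    hook = create_non_fused_allreduce_gradient_hook(p, FakeProcessGroup(), 4)
    for _ in range(8):  # two accumulation windows -> allreduce fires twice
        hook()
```

The patch's switch from `register_hook` to `_register_backward_hook` fits the same picture: the new hook takes no `grad` argument and reads `param.main_grad` or `param.grad` directly, so a plausible reading is that it must run after the parameter's gradient has been fully accumulated rather than act as a gradient-transforming hook.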
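The configuration rules scattered across the five patches can also be read as one small decision procedure. The helper below is a hedged sketch, not ppfleetx's `process_dist_config`: it works on a flat dict (in ppfleetx the same keys live in different sections of the YAML config) and encodes two rules stated in the patches: sequence parallelism reuses the tensor-model-parallel communication group, so it is turned off when `mp_degree` is 1, and `use_main_grad` forces the non-fused allreduce path.

```python
# Hedged sketch, not ppfleetx API: the sequence-parallel constraints described
# in the patches, folded into one helper over a flat stand-in config dict.


def resolve_sequence_parallel(cfg: dict) -> dict:
    out = dict(cfg)
    out.setdefault("fuse_sequence_parallel_allreduce", False)

    # Sequence parallelism reuses the tensor-model-parallel communication group,
    # so it is forcibly turned off when mp_degree <= 1 (see hybrid_model.py).
    if out.get("sequence_parallel") and out.get("mp_degree", 1) <= 1:
        out["sequence_parallel"] = False

    # use_main_grad is incompatible with the fused allreduce hook (see config.py),
    # so the fused path is disabled and the per-parameter hooks are used instead.
    if out.get("use_main_grad") is True:
        out["fuse_sequence_parallel_allreduce"] = False

    return out


if __name__ == "__main__":
    print(resolve_sequence_parallel(
        {"sequence_parallel": True, "mp_degree": 1, "use_main_grad": True}))
    # sequence_parallel is disabled (mp_degree == 1) and the fused allreduce
    # stays off because use_main_grad is set.
```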