From 5cf67439efcaa1a6fd67cfa43d392adbd5becd97 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Sat, 25 Feb 2023 17:23:50 +0800 Subject: [PATCH 01/25] Add the support of bfloat16 amp training. --- .../configs/nlp/gpt/pretrain_gpt_base.yaml | 3 ++ ppfleetx/core/engine/eager_engine.py | 36 +++++++++++++------ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml index 6c286c152..e20b23de2 100644 --- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml +++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml @@ -17,6 +17,8 @@ Engine: test_iters: mix_precision: use_pure_fp16: True + dtype: "float16" + level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] @@ -91,5 +93,6 @@ Profiler: profiler_log: profiler_log detailed: False + Distributed: fuse_sequence_parallel_allreduce: False diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index 1589918ac..de2bd01c1 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -135,6 +135,8 @@ def configure_optimizers(self): logger.info("NOTE: disable use_pure_fp16 in export mode") self._use_pure_fp16 = False + self._amp_dtype = self._configs['mix_precision']['dtype'] + self._amp_level = self._configs['mix_precision']['level'] self._scale_loss = self._configs['mix_precision']['scale_loss'] self._custom_black_list = self._configs['mix_precision'][ 'custom_black_list'] @@ -177,14 +179,17 @@ def configure_optimizers(self): self._use_recompute = configs['Model']['use_recompute'] if self._use_pure_fp16: - if mode == 'train': + if mode == 'train' and self._amp_dtype == "float16": self._scaler = paddle.amp.GradScaler( init_loss_scaling=self._scale_loss) # Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. 
- self._module.model = paddle.amp.decorate( - models=self._module.model, level='O2') + if self._amp_level == "O2": + self._module.model = paddle.amp.decorate( + models=self._module.model, + dtype=self._amp_dtype, + level=self._amp_level) else: self._scaler = None @@ -373,6 +378,8 @@ def _train_one_epoch(self, if self.profiler: self.profiler.step() + if step == 120: + return def fit(self, epoch=1, train_data_loader=None, valid_data_loader=None): """ @@ -456,10 +463,11 @@ def _fit_impl(self, batch): loss = self._model_forward_backward(batch) else: with paddle.amp.auto_cast( - self._use_pure_fp16, + enable=self._use_pure_fp16, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, - level='O2'): + dtype=self._amp_dtype, + level=self._amp_level): batch = self._module.model._prepare_training( batch, self._optimizer, self._lr_scheduler) loss = self._module.model.forward_backward_pipeline( @@ -485,10 +493,14 @@ def _model_forward_backward(self, batch): self._use_pure_fp16, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, - level='O2'): + dtype=self._amp_dtype, + level=self._amp_level): loss = self._module.training_step(micro_batch) - loss_bw = self._scaler.scale(loss) if self._use_pure_fp16 else loss + if self._use_pure_fp16 and self._amp_dtype == "float16": + loss_bw = self._scaler.scale(loss) + else: + loss_bw = loss if self._accumulate_steps > 1: # div the loss for backward loss_bw = loss_bw / self._accumulate_steps @@ -516,7 +528,7 @@ def _optim_update_params(self): p.bw_storage.scale_(1.0 / self._dp_group.nranks) dist.all_reduce(p.bw_storage, group=self._dp_group) - if self._use_pure_fp16: + if self._use_pure_fp16 and self._amp_dtype == "float16": self._scaler.step(self._optimizer) self._scaler.update() else: @@ -590,7 +602,8 @@ def _evaluate_impl(self, batch): self._use_pure_fp16, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, - level='O2'): + dtype=self._amp_dtype, + level=self._amp_level): if self._pp_degree == 1: loss = self._module.validation_step(batch) else: @@ -646,7 +659,8 @@ def _predict_impl(self, batch): self._use_pure_fp16, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, - level='O2'): + dtype=self._amp_dtype, + level=self._amp_level): if self._pp_degree == 1: loss = self._module.test_step(batch) else: @@ -849,6 +863,8 @@ def _profiler_done(self): logger.info("Profiler finished, prepare to print summary...") self.profiler.stop() + self.profiler.summary(op_detail=True) + return self._print_summary() profiler_log = self.profiler_config.get('profiler_log', From 52c7a833c503f5c781295360aa6cd7748b4eca7d Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Sat, 25 Feb 2023 18:18:27 +0800 Subject: [PATCH 02/25] Allow to use multi_precision to grad_clip. 
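Expose a multi_precision switch under Optimizer.grad_clip so that the
global-norm computation can stay in float32 even when the gradients
themselves are float16 or bfloat16. A minimal sketch of how the flag is
consumed in build_grad_clip (the grad_clip_config dict below is illustrative,
mirroring the YAML defaults rather than the engine's real config object):

    import paddle

    grad_clip_config = {"clip_norm": 1.0, "multi_precision": True}

    # When requested, have ClipGradByGlobalNorm accumulate the global norm
    # in float32 for low-precision gradients.
    if grad_clip_config.pop("multi_precision", False):
        paddle.nn.clip._clip_by_global_norm_using_mp_type(True)

    grad_clip = paddle.nn.ClipGradByGlobalNorm(
        clip_norm=grad_clip_config["clip_norm"])

Popping the key before constructing the clip object keeps it from being
forwarded as an unexpected keyword argument.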
--- ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml | 1 + ppfleetx/optims/__init__.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml index e20b23de2..7aa34e511 100644 --- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml +++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml @@ -84,6 +84,7 @@ Optimizer: grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 + multi_precision: False tensor_fusion: False diff --git a/ppfleetx/optims/__init__.py b/ppfleetx/optims/__init__.py index 27976942c..8c5595fc3 100644 --- a/ppfleetx/optims/__init__.py +++ b/ppfleetx/optims/__init__.py @@ -43,6 +43,9 @@ def build_lr_scheduler(lr_config): def build_grad_clip(grad_clip_config): if grad_clip_config is not None: + multi_precision = grad_clip_config.pop('multi_precision', False) + if multi_precision: + paddle.nn.clip._clip_by_global_norm_using_mp_type(True) grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') grad_clip = eval(grad_clip_name)(**grad_clip_config) return grad_clip From 2f9a0397ef2d4d15cd4a0d03b11d0e3b58ea217d Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Mon, 27 Feb 2023 18:25:36 +0800 Subject: [PATCH 03/25] Applay GradScaler for bfloat16. --- ppfleetx/core/engine/eager_engine.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index de2bd01c1..54090452b 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -182,6 +182,9 @@ def configure_optimizers(self): if mode == 'train' and self._amp_dtype == "float16": self._scaler = paddle.amp.GradScaler( init_loss_scaling=self._scale_loss) + else: # bfloat16 + self._scaler = paddle.amp.GradScaler( + init_loss_scaling=1, use_dynamic_loss_scaling=False) # Save dtype is the same as model dtype. Also can set save_dtype='float32' when # training with pure fp16 strategy, but will cause the rise of memory. @@ -378,8 +381,6 @@ def _train_one_epoch(self, if self.profiler: self.profiler.step() - if step == 120: - return def fit(self, epoch=1, train_data_loader=None, valid_data_loader=None): """ @@ -497,10 +498,7 @@ def _model_forward_backward(self, batch): level=self._amp_level): loss = self._module.training_step(micro_batch) - if self._use_pure_fp16 and self._amp_dtype == "float16": - loss_bw = self._scaler.scale(loss) - else: - loss_bw = loss + loss_bw = self._scaler.scale(loss) if self._use_pure_fp16 else loss if self._accumulate_steps > 1: # div the loss for backward loss_bw = loss_bw / self._accumulate_steps @@ -528,7 +526,7 @@ def _optim_update_params(self): p.bw_storage.scale_(1.0 / self._dp_group.nranks) dist.all_reduce(p.bw_storage, group=self._dp_group) - if self._use_pure_fp16 and self._amp_dtype == "float16": + if self._use_pure_fp16: self._scaler.step(self._optimizer) self._scaler.update() else: @@ -863,8 +861,6 @@ def _profiler_done(self): logger.info("Profiler finished, prepare to print summary...") self.profiler.stop() - self.profiler.summary(op_detail=True) - return self._print_summary() profiler_log = self.profiler_config.get('profiler_log', From 2915f573d979c319725f9588d92ab9a801ad029c Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Tue, 28 Feb 2023 16:30:29 +0800 Subject: [PATCH 04/25] Fix ci error. 
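Read the AMP options through a local amp_config handle and fetch the new
dtype/level keys with dict.get defaults, so configs that do not define
mix_precision.dtype or mix_precision.level fall back to float16 / O2. A
minimal, illustrative sketch of the fallback (a plain dict stands in for the
engine's mix_precision section; the values mirror the YAML defaults and are
not taken from a real run):

    # Older-style config: no dtype/level keys present.
    amp_config = {"use_pure_fp16": True, "scale_loss": 32768.0}

    amp_dtype = amp_config.get("dtype", "float16")  # falls back to "float16"
    amp_level = amp_config.get("level", "O2")       # falls back to "O2"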
--- ppfleetx/core/engine/eager_engine.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index 54090452b..c6faa693c 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -130,18 +130,17 @@ def configure_optimizers(self): self._num_train_epochs = self._configs['num_train_epochs'] self._accumulate_steps = self._configs['accumulate_steps'] - self._use_pure_fp16 = self._configs['mix_precision']['use_pure_fp16'] + amp_config = self._configs['mix_precision'] + self._use_pure_fp16 = amp_config['use_pure_fp16'] if mode == 'export' and self._use_pure_fp16: logger.info("NOTE: disable use_pure_fp16 in export mode") self._use_pure_fp16 = False - self._amp_dtype = self._configs['mix_precision']['dtype'] - self._amp_level = self._configs['mix_precision']['level'] - self._scale_loss = self._configs['mix_precision']['scale_loss'] - self._custom_black_list = self._configs['mix_precision'][ - 'custom_black_list'] - self._custom_white_list = self._configs['mix_precision'][ - 'custom_white_list'] + self._amp_dtype = amp_config.get('dtype', 'float16') + self._amp_level = amp_config.get('level', 'O2') + self._scale_loss = amp_config('scale_loss') + self._custom_black_list = amp_config('custom_black_list') + self._custom_white_list = amp_config('custom_white_list') self._save_steps = self._configs['save_load']['save_steps'] self._save_epoch = self._configs['save_load']['save_epoch'] From 59546bab5792fa91e97acb63f528bc4b25309853 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Wed, 1 Mar 2023 14:11:09 +0800 Subject: [PATCH 05/25] Fix typo and add print of loss_scale. --- ppfleetx/core/engine/eager_engine.py | 8 +++++--- .../models/language_model/language_module.py | 17 ++++++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index c6faa693c..9cf5c9e89 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -138,9 +138,9 @@ def configure_optimizers(self): self._amp_dtype = amp_config.get('dtype', 'float16') self._amp_level = amp_config.get('level', 'O2') - self._scale_loss = amp_config('scale_loss') - self._custom_black_list = amp_config('custom_black_list') - self._custom_white_list = amp_config('custom_white_list') + self._scale_loss = amp_config['scale_loss'] + self._custom_black_list = amp_config['custom_black_list'] + self._custom_white_list = amp_config['custom_white_list'] self._save_steps = self._configs['save_load']['save_steps'] self._save_epoch = self._configs['save_load']['save_epoch'] @@ -334,6 +334,8 @@ def _train_one_epoch(self, 'loss': sum(numpy_losses) / len(numpy_losses), 'lr': self._optimizer.get_lr() } + if self._use_pure_fp16: + log_dict['loss_scale'] = self._scaler._scale self._module.training_step_end(log_dict) train_step_start = get_timestamp() diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index b50383d8f..f966422b0 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -75,11 +75,18 @@ def training_step_end(self, log_dict): default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_len - logger.info( - "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ - "ips_total: %.0f tokens/s, 
ips: %.0f tokens/s, learning rate: %.5e" - % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed, - speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, log_dict['lr'])) + if log_dict.get('loss_scale', None) is not None: + logger.info( + "[train] epoch: %d, batch: %d, loss: %.9f, loss_scale: %.2f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ + "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" + % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['loss_scale'], log_dict['train_cost'], speed, + speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, log_dict['lr'])) + else: + logger.info( + "[train] epoch: %d, batch: %d, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ + "ips_total: %.0f tokens/s, ips: %.0f tokens/s, learning rate: %.5e" + % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['train_cost'], speed, + speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, log_dict['lr'])) def validation_step(self, batch): tokens, position_ids, labels, loss_mask = batch From c91472dd663947652203d9ee420518fbec569c18 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Thu, 2 Mar 2023 11:01:00 +0000 Subject: [PATCH 06/25] fix format --- ppfleetx/models/language_model/language_module.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index 125504484..68658e3e3 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -75,10 +75,12 @@ def training_step_end(self, log_dict): default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_len - loss_scale_str = "loss_scale: %.9f, ".format(log_dict['loss_scale']) if log_dict.get('loss_scale', None) is not None else "" + loss_scale_str = "loss_scale: %.9f,".format(log_dict[ + 'loss_scale']) if log_dict.get('loss_scale', + None) is not None else "" logger.info( "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ - "ips_total: %.0f tokens/s, ips: %.0f tokens/s, {}learning rate: %.5e" + "ips_total: %.0f tokens/s, ips: %.0f tokens/s, %s learning rate: %.5e" % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_step'], log_dict['loss'], log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, loss_scale_str, log_dict['lr'])) From 8a60a76a0e2e1c1f98bda2c3c0d8bfbdd837bd5c Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Fri, 3 Mar 2023 08:15:55 +0000 Subject: [PATCH 07/25] rename use_pure_fp16 as mix_precision.enable --- .../benchmark_common/run_benchmark.sh | 2 +- .../benchmark_common/run_benchmark.sh | 2 +- .../benchmark_common/run_benchmark.sh | 2 +- docs/standard.md | 4 ++-- .../transformer/models/GPT/docs/README.md | 4 ++-- .../finetune_gpt_345M_single_card_glue.yaml | 2 +- .../transformer/models/GPT/finetune/impls.py | 2 +- .../transformer/models/GPT/finetune/run.py | 6 ++--- .../configs/generation_gpt_base.yaml | 2 +- .../models/GPT/generation/export.py | 4 ++-- .../offline-eval/configs/eval_gpt_base.yaml | 2 +- .../models/GPT/offline-eval/impls.py | 2 +- .../models/GPT/offline-eval/run.py | 2 +- .../pretrain/configs/pretrain_gpt_base.yaml | 2 +- 
.../transformer/models/GPT/pretrain/export.py | 4 ++-- .../transformer/models/GPT/pretrain/impls.py | 6 ++--- .../transformer/models/GPT/pretrain/run.py | 4 ++-- .../pretrain_moe_345M_single_card.yaml | 2 +- .../configs/pretrain_moe_base.yaml | 2 +- .../models/GPT/pretrain_moe/impls.py | 8 +++---- .../models/GPT/pretrain_moe/run.py | 4 ++-- examples/transformer/utils/config.py | 2 +- .../multimodal/imagen/imagen_base.yaml | 2 +- .../imagen/imagen_super_resolution_1024.yaml | 2 +- .../nlp/ernie/auto/pretrain_ernie_base.yaml | 2 +- .../nlp/ernie/finetune_ernie_base.yaml | 2 +- .../nlp/ernie/pretrain_ernie_base.yaml | 2 +- .../configs/nlp/ernie/qat_ernie_base.yaml | 2 +- .../finetune_gpt_345M_single_card_glue.yaml | 2 +- .../configs/nlp/gpt/pretrain_gpt_base.yaml | 2 +- .../configs/nlp/moe/pretrain_moe_base.yaml | 2 +- .../vis/moco/moco_lincls_in1k_1n8c.yaml | 2 +- .../configs/vis/moco/mocov1_pt_in1k_1n8c.yaml | 2 +- .../configs/vis/moco/mocov2_pt_in1k_1n8c.yaml | 2 +- .../vit/ViT_base_patch16_224_inference.yaml | 2 +- ...e_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml | 2 +- ...e_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml | 2 +- ...h16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml | 2 +- ...tch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml | 2 +- ...e_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml | 2 +- ...tch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml | 2 +- ...patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml | 2 +- ppfleetx/core/engine/eager_engine.py | 24 +++++++++---------- ppfleetx/models/multimodal_model/utils.py | 2 +- ppfleetx/models/vision_model/moco/moco.py | 2 +- ppfleetx/utils/config.py | 4 ++-- projects/gpt/docs/README.md | 4 ++-- .../run_super_resolution_1024_sharding128.sh | 2 +- 48 files changed, 74 insertions(+), 74 deletions(-) diff --git a/benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh b/benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh index d94dd5856..2aa7bee8a 100644 --- a/benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh +++ b/benchmarks/test_tipc/ernie/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh @@ -87,7 +87,7 @@ function _train(){ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=${eval_freq} \ - -o Engine.mix_precision.use_pure_fp16=${use_pure_fp16} \ + -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.hidden_size=1024 \ -o Model.num_hidden_layers=${num_layers} \ diff --git a/benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh b/benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh index f6b6af5b1..6832aa91e 100644 --- a/benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh +++ b/benchmarks/test_tipc/gpt/dygraph/hybrid_parallel/benchmark_common/run_benchmark.sh @@ -86,7 +86,7 @@ function _train(){ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=${eval_freq} \ - -o Engine.mix_precision.use_pure_fp16=${use_pure_fp16} \ + -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.hidden_size=1024 \ -o Model.num_layers=${num_layers} \ diff --git a/benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh b/benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh index 8a0bc13d1..13dd8cd2e 100644 --- 
a/benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh +++ b/benchmarks/test_tipc/gpt/dygraph/sharding/benchmark_common/run_benchmark.sh @@ -81,7 +81,7 @@ function _train(){ -o Global.micro_batch_size=${micro_batch_size} \ -o Engine.max_steps=${max_iter} \ -o Engine.eval_freq=${eval_freq} \ - -o Engine.mix_precision.use_pure_fp16=${use_pure_fp16} \ + -o Engine.mix_precision.enable=${use_pure_fp16} \ -o Engine.save_load.save_steps=100000 \ -o Model.use_recompute=${use_recompute} \ -o Distributed.dp_degree=${dp_degree} \ diff --git a/docs/standard.md b/docs/standard.md index bfa9419f2..9aac894bd 100644 --- a/docs/standard.md +++ b/docs/standard.md @@ -102,7 +102,7 @@ Engine: eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] @@ -123,7 +123,7 @@ Engine: | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | -| use_pure_fp16 | 是否使用purefp16精度训练 | +| enable | 是否使用purefp16精度训练 | | scale_loss | 使用fp16精度下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16计算。 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16计算。| diff --git a/examples/transformer/models/GPT/docs/README.md b/examples/transformer/models/GPT/docs/README.md index 102ce6208..a6d32886c 100644 --- a/examples/transformer/models/GPT/docs/README.md +++ b/examples/transformer/models/GPT/docs/README.md @@ -102,7 +102,7 @@ cd .. # 回到 GPT 目录下 eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] @@ -128,7 +128,7 @@ cd .. 
# 回到 GPT 目录下 | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | -| use_pure_fp16 | 是否使用purefp16精度训练 | +| enable | 是否使用purefp16精度训练 | | scale_loss | 使用fp16精度下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16计算。 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16计算。| diff --git a/examples/transformer/models/GPT/finetune/configs/finetune_gpt_345M_single_card_glue.yaml b/examples/transformer/models/GPT/finetune/configs/finetune_gpt_345M_single_card_glue.yaml index 6615f516a..1bb1a2910 100644 --- a/examples/transformer/models/GPT/finetune/configs/finetune_gpt_345M_single_card_glue.yaml +++ b/examples/transformer/models/GPT/finetune/configs/finetune_gpt_345M_single_card_glue.yaml @@ -11,7 +11,7 @@ Global: logging_freq: 10 eval_freq: 1 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "reduce_mean"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/examples/transformer/models/GPT/finetune/impls.py b/examples/transformer/models/GPT/finetune/impls.py index ccab9c9fb..d50d8791b 100644 --- a/examples/transformer/models/GPT/finetune/impls.py +++ b/examples/transformer/models/GPT/finetune/impls.py @@ -190,7 +190,7 @@ def fit_impl(config, batch, forward_func, **kwargs): def eval_impl(config, batch, model, loss_fn, eval_metric): model.eval() - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list diff --git a/examples/transformer/models/GPT/finetune/run.py b/examples/transformer/models/GPT/finetune/run.py index 102a27399..0b82f2bfe 100644 --- a/examples/transformer/models/GPT/finetune/run.py +++ b/examples/transformer/models/GPT/finetune/run.py @@ -62,7 +62,7 @@ # build GPT model model, tokenizer, train_loss_fn, eval_loss_fn = impls.build_model(config) - if config.Global.mix_precision.use_pure_fp16: + if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. 
Also can set save_dtype='float32' when @@ -98,14 +98,14 @@ if 'multi_precision' in config.Optimizer: assert config.Optimizer.pop('multi_precision') \ - == config.Global.mix_precision.use_pure_fp16 + == config.Global.mix_precision.enable lr_scheduler = cpn.build_lr_scheduler(config.Optimizer.lr) optimizer = cpn.build_optimizer( config.Optimizer, model, lr_scheduler, - multi_precision=config.Global.mix_precision.use_pure_fp16) + multi_precision=config.Global.mix_precision.enable) # call fleet wrapper if nranks > 1: diff --git a/examples/transformer/models/GPT/generation/configs/generation_gpt_base.yaml b/examples/transformer/models/GPT/generation/configs/generation_gpt_base.yaml index 2b1c12555..f14e25ffd 100644 --- a/examples/transformer/models/GPT/generation/configs/generation_gpt_base.yaml +++ b/examples/transformer/models/GPT/generation/configs/generation_gpt_base.yaml @@ -14,7 +14,7 @@ Global: eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/examples/transformer/models/GPT/generation/export.py b/examples/transformer/models/GPT/generation/export.py index 174668c53..d497026fa 100644 --- a/examples/transformer/models/GPT/generation/export.py +++ b/examples/transformer/models/GPT/generation/export.py @@ -51,8 +51,8 @@ cfg.process_configs(config) cfg.print_config(config) - if config.Global.mix_precision.use_pure_fp16: - logger.info("NOTE: disable use_pure_fp16 in export mode") + if config.Global.mix_precision.enable: + logger.info("NOTE: disable mix_precision in export mode") # build GPT model model, _ = impls.build_model(config) diff --git a/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml b/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml index 7754b2dd7..8af494f54 100644 --- a/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml +++ b/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml @@ -14,7 +14,7 @@ Global: eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/examples/transformer/models/GPT/offline-eval/impls.py b/examples/transformer/models/GPT/offline-eval/impls.py index 010bbb615..4c24410d5 100644 --- a/examples/transformer/models/GPT/offline-eval/impls.py +++ b/examples/transformer/models/GPT/offline-eval/impls.py @@ -61,7 +61,7 @@ def build_model(config): def eval_impl(config, batch, model): model.eval() - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list diff --git a/examples/transformer/models/GPT/offline-eval/run.py b/examples/transformer/models/GPT/offline-eval/run.py index 696f6a344..823f78429 100644 --- a/examples/transformer/models/GPT/offline-eval/run.py +++ b/examples/transformer/models/GPT/offline-eval/run.py @@ -77,7 +77,7 @@ ] model, quanter = qat.compress_model(config, model, input_spec) - if config.Global.mix_precision.use_pure_fp16: + if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. 
Also can set save_dtype='float32' when diff --git a/examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_base.yaml b/examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_base.yaml index d21a2dcc3..6a89fa441 100644 --- a/examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_base.yaml +++ b/examples/transformer/models/GPT/pretrain/configs/pretrain_gpt_base.yaml @@ -14,7 +14,7 @@ Global: eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/examples/transformer/models/GPT/pretrain/export.py b/examples/transformer/models/GPT/pretrain/export.py index 69e55030c..7e1d6ebdc 100644 --- a/examples/transformer/models/GPT/pretrain/export.py +++ b/examples/transformer/models/GPT/pretrain/export.py @@ -51,8 +51,8 @@ cfg.process_configs(config) cfg.print_config(config) - if config.Global.mix_precision.use_pure_fp16: - logger.info("NOTE: disable use_pure_fp16 in export mode") + if config.Global.mix_precision.enable: + logger.info("NOTE: disable mix_precision in export mode") # build GPT model model, _, _ = impls.build_model(config) diff --git a/examples/transformer/models/GPT/pretrain/impls.py b/examples/transformer/models/GPT/pretrain/impls.py index 2731c0f79..1f91c8c64 100644 --- a/examples/transformer/models/GPT/pretrain/impls.py +++ b/examples/transformer/models/GPT/pretrain/impls.py @@ -101,7 +101,7 @@ def build_model(config): def model_forward_backward(config, batch, forward_func, **kwargs): acc_steps = config.Global.accumulate_steps - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list @@ -165,7 +165,7 @@ def model_forward_backward(config, batch, forward_func, **kwargs): def optim_update_params(config, **kwargs): hcg = env.get_hcg() - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable dp_degree = config.Distributed.dp_degree sharding_stage = config.Distributed.sharding.sharding_stage @@ -221,7 +221,7 @@ def fit_impl(config, batch, forward_func, **kwargs): def eval_impl(config, batch, model, loss_fn): model.eval() - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list diff --git a/examples/transformer/models/GPT/pretrain/run.py b/examples/transformer/models/GPT/pretrain/run.py index ac0a547da..ab31d9927 100644 --- a/examples/transformer/models/GPT/pretrain/run.py +++ b/examples/transformer/models/GPT/pretrain/run.py @@ -83,7 +83,7 @@ ] model, quanter = qat.compress_model(config, model, input_spec) - if config.Global.mix_precision.use_pure_fp16: + if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model dtype. 
Also can set save_dtype='float32' when @@ -104,7 +104,7 @@ config.Optimizer, model, lr_scheduler, - multi_precision=config.Global.mix_precision.use_pure_fp16) + multi_precision=config.Global.mix_precision.enable) # call fleet wrapper if nranks > 1: diff --git a/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_345M_single_card.yaml b/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_345M_single_card.yaml index f3936ef2d..24aa25c1e 100644 --- a/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_345M_single_card.yaml +++ b/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_345M_single_card.yaml @@ -7,7 +7,7 @@ Global: max_steps: 20000 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True Data: Train: diff --git a/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_base.yaml b/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_base.yaml index d4b6e48ec..cc52157a4 100644 --- a/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_base.yaml +++ b/examples/transformer/models/GPT/pretrain_moe/configs/pretrain_moe_base.yaml @@ -14,7 +14,7 @@ Global: eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/examples/transformer/models/GPT/pretrain_moe/impls.py b/examples/transformer/models/GPT/pretrain_moe/impls.py index 2224aa4ff..da8a2c8fb 100644 --- a/examples/transformer/models/GPT/pretrain_moe/impls.py +++ b/examples/transformer/models/GPT/pretrain_moe/impls.py @@ -59,7 +59,7 @@ def _get_model_size(l, h, v, s, ne, ei): # gate P += (h * nei + nei) # experts - P += nei * (8 * h * h + 5 * h) + P += nei * (8 * h * h + 5 * h) # FFN Layer else: P += 8 * h * h + 5 * h @@ -120,7 +120,7 @@ def build_model(config): def model_forward_backward(config, batch, forward_func, **kwargs): acc_steps = config.Global.accumulate_steps - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list @@ -199,7 +199,7 @@ def model_forward_backward(config, batch, forward_func, **kwargs): def optim_update_params(config, **kwargs): hcg = env.get_hcg() - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable dp_degree = config.Distributed.dp_degree sharding_stage = config.Distributed.sharding.sharding_stage @@ -255,7 +255,7 @@ def fit_impl(config, batch, forward_func, **kwargs): def eval_impl(config, batch, model, loss_fn): model.eval() - use_fp16 = config.Global.mix_precision.use_pure_fp16 + use_fp16 = config.Global.mix_precision.enable black_list = config.Global.mix_precision.custom_black_list white_list = config.Global.mix_precision.custom_white_list diff --git a/examples/transformer/models/GPT/pretrain_moe/run.py b/examples/transformer/models/GPT/pretrain_moe/run.py index 78c85b07f..bbd677034 100644 --- a/examples/transformer/models/GPT/pretrain_moe/run.py +++ b/examples/transformer/models/GPT/pretrain_moe/run.py @@ -84,7 +84,7 @@ ] model, quanter = qat.compress_model(config, model, input_spec) - if config.Global.mix_precision.use_pure_fp16: + if config.Global.mix_precision.enable: scaler = paddle.amp.GradScaler( init_loss_scaling=config.Global.mix_precision.scale_loss) # Note: Save dtype is the same as model 
dtype. Also can set save_dtype='float32' when @@ -105,7 +105,7 @@ config.Optimizer, model, lr_scheduler, - multi_precision=config.Global.mix_precision.use_pure_fp16) + multi_precision=config.Global.mix_precision.enable) # call fleet wrapper if nranks > 1: diff --git a/examples/transformer/utils/config.py b/examples/transformer/utils/config.py index d5a6bf8ec..0c04a813d 100644 --- a/examples/transformer/utils/config.py +++ b/examples/transformer/utils/config.py @@ -413,7 +413,7 @@ def process_global_configs(config): global_cfg['mix_precision'] = global_cfg.get('mix_precision', {}) amp_cfg = global_cfg.mix_precision - amp_cfg['use_pure_fp16'] = amp_cfg.get('use_pure_fp16', False) + amp_cfg['enable'] = amp_cfg.get('enable', False) amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768) amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None) amp_cfg['custom_white_list'] = amp_cfg.get('custom_white_list', None) diff --git a/ppfleetx/configs/multimodal/imagen/imagen_base.yaml b/ppfleetx/configs/multimodal/imagen/imagen_base.yaml index f8101841c..4f005107b 100644 --- a/ppfleetx/configs/multimodal/imagen/imagen_base.yaml +++ b/ppfleetx/configs/multimodal/imagen/imagen_base.yaml @@ -15,7 +15,7 @@ Engine: eval_freq: 10000000 eval_iters: 10000000 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml b/ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml index 75624da78..b6f1febe3 100644 --- a/ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml +++ b/ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml @@ -37,7 +37,7 @@ Engine: eval_freq: 10000000 eval_iters: 10000000 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base.yaml b/ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base.yaml index 6315f3378..a4026fb09 100644 --- a/ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base.yaml +++ b/ppfleetx/configs/nlp/ernie/auto/pretrain_ernie_base.yaml @@ -17,7 +17,7 @@ Engine: eval_iters: 10 test_iters: -1 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/nlp/ernie/finetune_ernie_base.yaml b/ppfleetx/configs/nlp/ernie/finetune_ernie_base.yaml index 25dd7609e..b53d143ca 100644 --- a/ppfleetx/configs/nlp/ernie/finetune_ernie_base.yaml +++ b/ppfleetx/configs/nlp/ernie/finetune_ernie_base.yaml @@ -17,7 +17,7 @@ Engine: eval_iters: 10 test_iters: -1 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml b/ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml index 72bc63856..747cadc08 100644 --- a/ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml +++ b/ppfleetx/configs/nlp/ernie/pretrain_ernie_base.yaml @@ -17,7 +17,7 @@ Engine: eval_iters: 10 test_iters: -1 mix_precision: - 
use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/nlp/ernie/qat_ernie_base.yaml b/ppfleetx/configs/nlp/ernie/qat_ernie_base.yaml index 770d760f0..e19b50ce7 100644 --- a/ppfleetx/configs/nlp/ernie/qat_ernie_base.yaml +++ b/ppfleetx/configs/nlp/ernie/qat_ernie_base.yaml @@ -17,7 +17,7 @@ Engine: eval_iters: 10 test_iters: -1 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml b/ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml index fc7bb1821..20be3699d 100644 --- a/ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml +++ b/ppfleetx/configs/nlp/gpt/finetune_gpt_345M_single_card_glue.yaml @@ -13,7 +13,7 @@ Engine: logging_freq: 10 eval_freq: 1 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div", "reduce_mean"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml index 54fa8d896..1979a2436 100644 --- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml +++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml @@ -16,7 +16,7 @@ Engine: eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True dtype: "float16" level: "O2" scale_loss: 32768.0 diff --git a/ppfleetx/configs/nlp/moe/pretrain_moe_base.yaml b/ppfleetx/configs/nlp/moe/pretrain_moe_base.yaml index 54cbbefbe..af970a16d 100644 --- a/ppfleetx/configs/nlp/moe/pretrain_moe_base.yaml +++ b/ppfleetx/configs/nlp/moe/pretrain_moe_base.yaml @@ -16,7 +16,7 @@ Engine: eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] diff --git a/ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml b/ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml index 825300b6d..580c12f73 100644 --- a/ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml +++ b/ppfleetx/configs/vis/moco/moco_lincls_in1k_1n8c.yaml @@ -12,7 +12,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml b/ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml index 908a083a4..8ece5faca 100644 --- a/ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml +++ b/ppfleetx/configs/vis/moco/mocov1_pt_in1k_1n8c.yaml @@ -12,7 +12,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml b/ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml index 83f030e76..37517d9a9 100644 --- a/ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml +++ b/ppfleetx/configs/vis/moco/mocov2_pt_in1k_1n8c.yaml @@ -12,7 +12,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - 
use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml b/ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml index b7c23463a..f0bee287f 100644 --- a/ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml +++ b/ppfleetx/configs/vis/vit/ViT_base_patch16_224_inference.yaml @@ -23,7 +23,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml b/ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml index 928bdec26..fa1fe7f56 100644 --- a/ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml +++ b/ppfleetx/configs/vis/vit/ViT_base_patch16_224_pt_in1k_2n16c_dp_fp16o2.yaml @@ -11,7 +11,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml b/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml index dcbee1748..d35d038b4 100644 --- a/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml +++ b/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml @@ -11,7 +11,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml b/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml index c0d4ad3f7..fc2cf1dbc 100644 --- a/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml +++ b/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_cifar10_1n8c_dp_fp16o2.yaml @@ -11,7 +11,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: False + enable: False scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml b/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml index 42beea034..7ff2b3dd5 100644 --- a/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml +++ b/ppfleetx/configs/vis/vit/ViT_base_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml @@ -11,7 +11,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml b/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml index b3baddd82..12bea877a 100644 --- a/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml +++ b/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_in1k_2n16c_dp_fp16o2.yaml @@ -11,7 +11,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff 
--git a/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml b/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml index 7887d854a..9ef6f77b8 100644 --- a/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml +++ b/ppfleetx/configs/vis/vit/ViT_large_patch16_384_ft_qat_in1k_2n16c_dp_fp16o2.yaml @@ -11,7 +11,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/configs/vis/vit/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml b/ppfleetx/configs/vis/vit/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml index 12766dc9f..165abf4e2 100644 --- a/ppfleetx/configs/vis/vit/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml +++ b/ppfleetx/configs/vis/vit/ViT_tiny_patch16_224_ci_cifar10_1n8c_dp_fp16o2.yaml @@ -11,7 +11,7 @@ Engine: accumulate_steps: 1 logging_freq: 10 mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "elementwise_div"] custom_white_list: [] diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index ecfa9438f..24fca77dd 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -131,10 +131,10 @@ def configure_optimizers(self): self._accumulate_steps = self._configs['accumulate_steps'] amp_config = self._configs['mix_precision'] - self._use_pure_fp16 = amp_config['use_pure_fp16'] - if mode == 'export' and self._use_pure_fp16: - logger.info("NOTE: disable use_pure_fp16 in export mode") - self._use_pure_fp16 = False + self._amp_enable = amp_config['enable'] + if mode == 'export' and self._amp_enable: + logger.info("NOTE: disable mix_precision in export mode") + self._amp_enable = False self._amp_dtype = amp_config.get('dtype', 'float16') self._amp_level = amp_config.get('level', 'O2') @@ -177,7 +177,7 @@ def configure_optimizers(self): self._use_recompute = configs['Model']['use_recompute'] - if self._use_pure_fp16: + if self._amp_enable: if mode == 'train' and self._amp_dtype == "float16": self._scaler = paddle.amp.GradScaler( init_loss_scaling=self._scale_loss) @@ -339,7 +339,7 @@ def _train_one_epoch(self, 'loss': sum(numpy_losses) / len(numpy_losses), 'lr': self._optimizer.get_lr() } - if self._use_pure_fp16: + if self._amp_enable: log_dict['loss_scale'] = self._scaler._scale self._module.training_step_end(log_dict) @@ -471,7 +471,7 @@ def _fit_impl(self, batch): loss = self._model_forward_backward(batch) else: with paddle.amp.auto_cast( - enable=self._use_pure_fp16, + enable=self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, @@ -498,14 +498,14 @@ def _model_forward_backward(self, batch): final_loss = None for micro_batch in batches: with paddle.amp.auto_cast( - self._use_pure_fp16, + self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, level=self._amp_level): loss = self._module.training_step(micro_batch) - loss_bw = self._scaler.scale(loss) if self._use_pure_fp16 else loss + loss_bw = self._scaler.scale(loss) if self._amp_enable else loss if self._accumulate_steps > 1: # div the loss for backward loss_bw = loss_bw / self._accumulate_steps @@ -533,7 +533,7 @@ def _optim_update_params(self): p.bw_storage.scale_(1.0 / self._dp_group.nranks) 
dist.all_reduce(p.bw_storage, group=self._dp_group) - if self._use_pure_fp16: + if self._amp_enable: self._scaler.step(self._optimizer) self._scaler.update() else: @@ -606,7 +606,7 @@ def _evaluate_impl(self, batch): batch = self._module.pretreating_batch(batch) with paddle.amp.auto_cast( - self._use_pure_fp16, + self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, @@ -664,7 +664,7 @@ def _predict_impl(self, batch): batch = self._module.pretreating_batch(batch) with paddle.amp.auto_cast( - self._use_pure_fp16, + self._amp_enable, custom_black_list=self._custom_black_list, custom_white_list=self._custom_white_list, dtype=self._amp_dtype, diff --git a/ppfleetx/models/multimodal_model/utils.py b/ppfleetx/models/multimodal_model/utils.py index adb76db15..5e175326d 100644 --- a/ppfleetx/models/multimodal_model/utils.py +++ b/ppfleetx/models/multimodal_model/utils.py @@ -112,7 +112,7 @@ def process_optim_configs(config): process optim configs for hybrid parallel """ config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][ - 'use_pure_fp16'] + 'enable'] def process_engine_configs(config): diff --git a/ppfleetx/models/vision_model/moco/moco.py b/ppfleetx/models/vision_model/moco/moco.py index 6a58b74a8..8a6f3a010 100644 --- a/ppfleetx/models/vision_model/moco/moco.py +++ b/ppfleetx/models/vision_model/moco/moco.py @@ -135,7 +135,7 @@ def __init__(self, @paddle.no_grad() def _update_momentum_encoder(self): """Momentum update of the momentum encoder""" - #Note(GuoxiaWang): disable auto cast when use use_pure_fp16 + #Note(GuoxiaWang): disable auto cast when use mix_precision with paddle.amp.auto_cast(False): for param_b, param_m in zip(self.base_encoder.parameters(), self.momentum_encoder.parameters()): diff --git a/ppfleetx/utils/config.py b/ppfleetx/utils/config.py index c51529633..cb6fed7de 100644 --- a/ppfleetx/utils/config.py +++ b/ppfleetx/utils/config.py @@ -165,7 +165,7 @@ def process_engine_config(config): config.Engine['mix_precision'] = config.Engine.get('mix_precision', {}) amp_cfg = config.Engine.mix_precision - amp_cfg['use_pure_fp16'] = amp_cfg.get('use_pure_fp16', False) + amp_cfg['enable'] = amp_cfg.get('enable', False) amp_cfg['scale_loss'] = amp_cfg.get('scale_loss', 32768) amp_cfg['custom_black_list'] = amp_cfg.get('custom_black_list', None) amp_cfg['custom_white_list'] = amp_cfg.get('custom_white_list', None) @@ -518,7 +518,7 @@ def process_auto_strategy(config): amp_cfg = config.Engine.get('mix_precision', {}) amp = strategy.amp amp.enable = amp_cfg.get('level', "") in ['o1', 'o2', 'o3'] - amp.use_pure_fp16 = amp_cfg.get('level', "") in ['o2', 'o3'] + amp.enable = amp_cfg.get('level', "") in ['o2', 'o3'] amp.use_optimizer_fp16 = amp_cfg.get('level', "") in ['o3'] amp.use_fp16_guard = amp_cfg.get('use_fp16_guard', False) amp.init_loss_scaling = amp_cfg.get('scale_loss', 32768) diff --git a/projects/gpt/docs/README.md b/projects/gpt/docs/README.md index 7ae21c469..5c332171e 100644 --- a/projects/gpt/docs/README.md +++ b/projects/gpt/docs/README.md @@ -131,7 +131,7 @@ Engine训练设置完成模型训练/验证/推理等过程中的参数设置, eval_iters: 10 test_iters: mix_precision: - use_pure_fp16: True + enable: True scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] @@ -152,7 +152,7 @@ Engine训练设置完成模型训练/验证/推理等过程中的参数设置, | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | -| use_pure_fp16 | 
是否使用purefp16精度训练 | +| enable | 是否使用purefp16精度训练 | | scale_loss | 使用fp16精度下,loss的放缩比例 | | custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16计算。 | | custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16计算。| diff --git a/projects/imagen/run_super_resolution_1024_sharding128.sh b/projects/imagen/run_super_resolution_1024_sharding128.sh index 86626aace..6c21fba5f 100644 --- a/projects/imagen/run_super_resolution_1024_sharding128.sh +++ b/projects/imagen/run_super_resolution_1024_sharding128.sh @@ -22,6 +22,6 @@ python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6, -c ./ppfleetx/configs/multimodal/imagen/imagen_super_resolution_1024.yaml \ -o Distributed.sharding.sharding_stage=2 \ -o Distributed.sharding.sharding_degree=8 \ - -o Engine.mix_precision.use_pure_fp16=False \ + -o Engine.mix_precision.enable=False \ -o Data.Train.loader.batch_size=1 \ -o Model.use_recompute=True \ From 6bb1e03d64028e482c0be2b6ab6d946aac19540d Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Sat, 4 Mar 2023 06:25:41 +0000 Subject: [PATCH 08/25] support main_grad --- .../configs/nlp/gpt/pretrain_gpt_base.yaml | 1 - ppfleetx/core/engine/eager_engine.py | 14 +- ppfleetx/distributed/apis/amp.py | 242 ++++++++++++++++++ .../models/language_model/language_module.py | 6 +- ppfleetx/models/language_model/utils.py | 8 +- ppfleetx/optims/__init__.py | 11 +- ppfleetx/optims/optimizer.py | 2 +- ppfleetx/utils/tensor_fusion_helper.py | 2 +- 8 files changed, 272 insertions(+), 14 deletions(-) create mode 100644 ppfleetx/distributed/apis/amp.py diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml index 1979a2436..0fbcb6a2d 100644 --- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml +++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml @@ -86,7 +86,6 @@ Optimizer: grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 - multi_precision: False tensor_fusion: False diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index 24fca77dd..efa20b52d 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -23,14 +23,14 @@ import paddle.distributed.fleet as fleet from paddle.optimizer.lr import LRScheduler -from paddle.fluid.dygraph.parallel import sync_params_buffers +from paddle.distributed.parallel import sync_params_buffers from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from paddle.profiler import SummaryView from paddle.distributed.fleet.meta_parallel import TensorParallel from paddle.distributed.sharding import group_sharded_parallel import paddleslim -from ppfleetx.distributed.apis import env +from ppfleetx.distributed.apis import env, amp from ppfleetx.optims import build_lr_scheduler, build_optimizer from ppfleetx.utils.log import logger, get_timestamp, convert_timestamp_to_data from ppfleetx.core.engine import BasicEngine, InferenceEngine, TensorRTConfig @@ -208,6 +208,14 @@ def configure_optimizers(self): configs.Optimizer, self._module.model, self._lr_scheduler) if mode == 'train' else None + if self._amp_enable and self._amp_dtype in [ + 'float16', 'bfloat16' + ] and self._amp_level == 'O2': + self._module.model = amp.MixPrecisionLayer( + self._module.model, dtype=self._amp_dtype) + self._optimizer = amp.MixPrecisionOptimizer(self._optimizer) + self._scaler = amp.MixPrecisionScaler(self._scaler) + # distributed configs self._distributed = 
(dist.get_world_size() > 1) @@ -340,7 +348,7 @@ def _train_one_epoch(self, 'lr': self._optimizer.get_lr() } if self._amp_enable: - log_dict['loss_scale'] = self._scaler._scale + log_dict['loss_scale'] = self._scaler._scale.numpy()[0] self._module.training_step_end(log_dict) train_step_start = get_timestamp() diff --git a/ppfleetx/distributed/apis/amp.py b/ppfleetx/distributed/apis/amp.py new file mode 100644 index 000000000..cd4812c56 --- /dev/null +++ b/ppfleetx/distributed/apis/amp.py @@ -0,0 +1,242 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import defaultdict +from types import MethodType +import numpy as np + +import paddle +import paddle.nn as nn +from paddle import _legacy_C_ops +from paddle.distributed import fleet +from paddle.fluid.dygraph import to_variable +from paddle.fluid import framework +from paddle.fluid.dygraph import base as imperative_base +from paddle.framework import core + +from ppfleetx.distributed.apis import env + + +class MixPrecisionLayer(nn.Layer): + def __init__(self, layers, dtype="float16"): + super().__init__(layers.full_name() + "_mix_precision") + + self._layers = layers + self._dtype = dtype + + assert self._dtype in ["float16", "bfloat16"] + + for param in self._layers.parameters(): + if not param.stop_gradient and not hasattr(param, "main_grad"): + setattr(param, "main_grad", None) + param._register_backward_hook( + self._update_main_grad_hook(param)) + + def _update_main_grad_hook(self, param): + """Create the update_main_grad hook for backprop.""" + + # Hook used for back-prop. + @paddle.autograd.no_grad() + def param_hook(*_): + # Add the gradient to the buffer. 
+ assert param.grad is not None and param.grad.value().get_tensor( + )._is_initialized() + + if param.main_grad is None: + param.main_grad = core.eager.Tensor( + value=param.grad.cast(paddle.float32).value(), + place=param.grad.place, + name="main_grad@" + param.name, ) + else: + param.main_grad.add_(param.grad.cast(paddle.float32)) + + param.clear_gradient(False) + + return param_hook + + def forward(self, *inputs, **kwargs): + outputs = self._layers(*inputs, **kwargs) + + return outputs + + def state_dict( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", ): + + return self._layers.state_dict( + destination=destination, + include_sublayers=include_sublayers, + structured_name_prefix=structured_name_prefix, ) + + @framework.deprecate_stat_dict + def set_state_dict(self, state_dict, use_structured_name=True): + + self._layers.set_state_dict( + state_dict, use_structured_name=use_structured_name) + + +class MixPrecisionOptimizer: + def __init__(self, optimizer): + self._inner_opt = optimizer + self._parameter_list = self._obtain_optimizer_parameters_list() + + def _obtain_optimizer_parameters_list(self): + if getattr(self._inner_opt, '_param_groups', None) and isinstance( + self._inner_opt._param_groups[0], dict): + parameters_list = [] + for group in self._inner_opt._param_groups: + for param in group['params']: + parameters_list.append(param) + else: + parameters_list = [ + param for param in self._inner_opt._parameter_list + ] + + return parameters_list + + @imperative_base.no_grad + @framework.dygraph_only + def step(self): + + if not isinstance(self._parameter_list[0], dict): + params_grads = [] + for param in self._parameter_list: + if param.stop_gradient: + continue + assert param._grad_ivar() is None + grad_var = param.main_grad + if framework.in_dygraph_mode(): + if (hasattr(grad_var, "is_selected_rows") and + grad_var.is_selected_rows() and + self._inner_opt.regularization is not None): + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if (hasattr(grad_var, "_is_sparse") and + grad_var._is_sparse() and + self._inner_opt.regularization is not None): + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + params_grads.append((param, grad_var)) + + optimize_ops = self._inner_opt._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + else: + # optimize parameters in groups + for param_group in self._inner_opt._param_groups: + params_grads = defaultdict(lambda: list()) + for param in param_group['params']: + if param.stop_gradient: + continue + assert param._grad_ivar() is None + grad_var = param.main_grad + if framework.in_dygraph_mode(): + if (hasattr(grad_var, "is_selected_rows") and + grad_var.is_selected_rows() and + self._inner_opt.regularization is not None): + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." + ) + else: + if (hasattr(grad_var, "_is_sparse") and + grad_var._is_sparse() and + self._inner_opt.regularization is not None): + raise RuntimeError( + "AdamW don't support weight_decay with sparse parameters, please set it to None." 
+ ) + params_grads['params'].append((param, grad_var)) + params_grads.update( + {k: v + for k, v in param_group.items() if k != 'params'}) + self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads) + + @framework.dygraph_only + def clear_grad(self, set_to_zero=True): + + param_list = [] + if self._parameter_list is None or not isinstance( + self._parameter_list[0], dict): + for p in self._parameter_list: + if not p.stop_gradient: + param_list.append(p) + else: + for param_group in self._param_groups: + for p in param_group['params']: + if not p.stop_gradient: + param_list.append(p) + + for p in param_list: + if hasattr(p, "main_grad"): + assert p.main_grad is not None + if set_to_zero: + p.main_grad.zero_() + else: + p.main_grad._clear() + p.main_grad = None + else: + p.clear_gradient(set_to_zero) + + def __getattr__(self, item): + return getattr(self._inner_opt, item) + + +def unscale_method(self, optimizer): + if not self._enable: + return + param_grads = [] + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + for group in optimizer._param_groups: + for param in group['params']: + assert param._grad_ivar() is None + if param.main_grad is not None: + assert param.main_grad.dtype == core.VarDesc.VarType.FP32 + param_grads.append(param.main_grad) + else: + for param in optimizer._parameter_list: + assert param._grad_ivar() is None + if param.main_grad is not None: + assert param.main_grad.dtype == core.VarDesc.VarType.FP32 + param_grads.append(param.main_grad) + + temp_found_inf = to_variable(np.array([0]).astype(np.bool_)) + if len(param_grads): + _legacy_C_ops.check_finite_and_unscale( + param_grads, + self._scale, + param_grads, + temp_found_inf, ) + + self._found_inf = 1 if temp_found_inf else 0 + + hcg = env.get_hcg() + if hcg is not None and hcg.nranks > hcg.get_data_parallel_world_size(): + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") + paddle.distributed.all_reduce( + is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None) + self._found_inf = is_found_inf.numpy()[0] + + +class MixPrecisionScaler: + def __init__(self, scaler): + self._inner_scaler = scaler + self._inner_scaler._unscale = MethodType(unscale_method, scaler) + + def __getattr__(self, item): + return getattr(self._inner_scaler, item) diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index 68658e3e3..474134f96 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -75,9 +75,9 @@ def training_step_end(self, log_dict): default_global_tokens_num = self.configs.Global.global_batch_size * \ self.configs.Data.Train.dataset.max_seq_len - loss_scale_str = "loss_scale: %.9f,".format(log_dict[ - 'loss_scale']) if log_dict.get('loss_scale', - None) is not None else "" + loss_scale_str = "loss_scale: %.9f," % ( + log_dict['loss_scale']) if log_dict.get('loss_scale', + None) is not None else "" logger.info( "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ "ips_total: %.0f tokens/s, ips: %.0f tokens/s, %s learning rate: %.5e" diff --git a/ppfleetx/models/language_model/utils.py b/ppfleetx/models/language_model/utils.py index 462743a3e..a3da33d46 100644 --- a/ppfleetx/models/language_model/utils.py +++ b/ppfleetx/models/language_model/utils.py @@ -127,7 +127,7 @@ def process_optim_configs(config): process optim configs for hybrid parallel """ 
config['Optimizer']['multi_precision'] = config['Engine']['mix_precision'][ - 'use_pure_fp16'] + 'enable'] nranks = dist.get_world_size() dp_degree = config['Distributed']['dp_degree'] @@ -136,6 +136,12 @@ def process_optim_configs(config): assert nranks == dp_degree * sharding_degree, \ "tensor_fusion only support single card train or data/sharding parallel train" + if config['Optimizer']['lr']['decay_steps'] is None: + config['Optimizer']['lr']['decay_steps'] = config['Engine'][ + 'max_steps'] + config['Optimizer']['lr']['decay_steps'] *= config['Global'][ + 'global_batch_size'] + def process_data_configs(config): """ diff --git a/ppfleetx/optims/__init__.py b/ppfleetx/optims/__init__.py index 8c5595fc3..6537e3143 100644 --- a/ppfleetx/optims/__init__.py +++ b/ppfleetx/optims/__init__.py @@ -43,11 +43,10 @@ def build_lr_scheduler(lr_config): def build_grad_clip(grad_clip_config): if grad_clip_config is not None: - multi_precision = grad_clip_config.pop('multi_precision', False) - if multi_precision: - paddle.nn.clip._clip_by_global_norm_using_mp_type(True) grad_clip_name = grad_clip_config.pop('name', 'ClipGradByGlobalNorm') - grad_clip = eval(grad_clip_name)(**grad_clip_config) + clip_norm = grad_clip_config.get('clip_norm', 1.0) + grad_clip = eval(grad_clip_name)( + **grad_clip_config) if clip_norm != 0. else None return grad_clip else: return None @@ -58,6 +57,10 @@ def build_optimizer(config, model, lr_scheduler=None): if lr_scheduler is not None: config.pop('lr') + multi_precision = config.get('multi_precision', False) + if multi_precision: + paddle.nn.clip._clip_by_global_norm_using_mp_type(True) + grad_clip_config = config.pop('grad_clip', None) grad_clip = build_grad_clip(grad_clip_config) diff --git a/ppfleetx/optims/optimizer.py b/ppfleetx/optims/optimizer.py index f8f0efba1..f3fcbb3e1 100644 --- a/ppfleetx/optims/optimizer.py +++ b/ppfleetx/optims/optimizer.py @@ -43,7 +43,7 @@ def __init__(self, learning_rate, parameters, grad_clip, **config): else: decay_params = [ p.name for p in parameters - if not any(nd in p.name for nd in ["bias", "norm"]) + if not any(nd in p.name for nd in ["bias", "norm", "b_0"]) ] apply_decay_param_fun = lambda x: x in decay_params diff --git a/ppfleetx/utils/tensor_fusion_helper.py b/ppfleetx/utils/tensor_fusion_helper.py index e3d77369d..0bb107f89 100644 --- a/ppfleetx/utils/tensor_fusion_helper.py +++ b/ppfleetx/utils/tensor_fusion_helper.py @@ -92,7 +92,7 @@ def fused_parameters(parameters, use_sharding=False): other_params = [] for param in parameters: - if not any(nd in param.name for nd in ["bias", "norm"]): + if not any(nd in param.name for nd in ["bias", "norm", "b_0"]): decay_params.append(param) else: other_params.append(param) From cf7fa387cc4254095a98f8ffc36879a333185529 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Sat, 4 Mar 2023 10:58:50 +0000 Subject: [PATCH 09/25] update --- ppfleetx/distributed/apis/amp.py | 34 +++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/ppfleetx/distributed/apis/amp.py b/ppfleetx/distributed/apis/amp.py index cd4812c56..d8b6e6ecc 100644 --- a/ppfleetx/distributed/apis/amp.py +++ b/ppfleetx/distributed/apis/amp.py @@ -19,7 +19,6 @@ import paddle import paddle.nn as nn from paddle import _legacy_C_ops -from paddle.distributed import fleet from paddle.fluid.dygraph import to_variable from paddle.fluid import framework from paddle.fluid.dygraph import base as imperative_base @@ -40,31 +39,42 @@ def __init__(self, layers, dtype="float16"): for param in 
self._layers.parameters(): if not param.stop_gradient and not hasattr(param, "main_grad"): setattr(param, "main_grad", None) - param._register_backward_hook( - self._update_main_grad_hook(param)) + param._register_grad_hook(self._update_main_grad_hook(param)) + # TODO: remove _release_grad_hook after solving the issue in _update_main_grad_hook + param._register_backward_hook(self._release_grad_hook(param)) def _update_main_grad_hook(self, param): """Create the update_main_grad hook for backprop.""" - # Hook used for back-prop. + # Hook used for back-prop and grad-merge. @paddle.autograd.no_grad() - def param_hook(*_): - # Add the gradient to the buffer. - assert param.grad is not None and param.grad.value().get_tensor( - )._is_initialized() - + def param_hook(tmp_grad): + # TODO: cancel the comments of the checking code + # assert param.grad is None, "param.grad is not None" if param.main_grad is None: param.main_grad = core.eager.Tensor( - value=param.grad.cast(paddle.float32).value(), - place=param.grad.place, + value=tmp_grad.cast(paddle.float32).value(), + place=tmp_grad.place, name="main_grad@" + param.name, ) else: - param.main_grad.add_(param.grad.cast(paddle.float32)) + param.main_grad.add_(tmp_grad.cast(paddle.float32)) + # NOTE: It doesn't work. param.clear_gradient(False) + return None return param_hook + def _release_grad_hook(self, param): + """Create the release_main_grad hook for backprop.""" + + # Hook used for back-prop and grad-merge. + @paddle.autograd.no_grad() + def release_hook(*_): + param.clear_gradient(False) + + return release_hook + def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) From 3012033220065ee5ac88210dcb295cc2afd565cc Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Sun, 5 Mar 2023 02:40:47 +0000 Subject: [PATCH 10/25] update --- ppfleetx/core/engine/eager_engine.py | 20 ++++++++++++++++++-- ppfleetx/distributed/apis/amp.py | 11 +++-------- ppfleetx/optims/lr_scheduler.py | 15 +++++++++++++++ 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index efa20b52d..5687d788c 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -138,6 +138,7 @@ def configure_optimizers(self): self._amp_dtype = amp_config.get('dtype', 'float16') self._amp_level = amp_config.get('level', 'O2') + self._use_main_grad = amp_config.get('use_main_grad', True) self._scale_loss = amp_config['scale_loss'] self._custom_black_list = amp_config['custom_black_list'] self._custom_white_list = amp_config['custom_white_list'] @@ -210,7 +211,7 @@ def configure_optimizers(self): if self._amp_enable and self._amp_dtype in [ 'float16', 'bfloat16' - ] and self._amp_level == 'O2': + ] and self._amp_level == 'O2' and self._use_main_grad: self._module.model = amp.MixPrecisionLayer( self._module.model, dtype=self._amp_dtype) self._optimizer = amp.MixPrecisionOptimizer(self._optimizer) @@ -300,7 +301,17 @@ def _wrap_sharding_2_3(self): self._broadcast_overlap, layers=origin_model, num_groups=2) def _wrap_3D_parallel(self): - self._module.model = fleet.distributed_model(self._module.model) + if isinstance(self._module.model, amp.MixPrecisionLayer): + if dist.get_world_size() == self._dp_degree: + sync_params_buffers( + self._module.model, + comm_group=self._dp_group, + src_rank=self._dp_group.ranks[0]) + elif self._pp_degree > 1: + self._module.model = fleet.distributed_model( + self._module.model._layers) + else: + self._module.model = 
fleet.distributed_model(self._module.model) self._optimizer = fleet.distributed_optimizer(self._optimizer) self._scaler = fleet.distributed_scaler( self._scaler) if self._scaler is not None else self._scaler @@ -475,6 +486,11 @@ def _fit_impl(self, batch): else: all_reduce_parameters(self._optimizer.all_fused_tensors, self._dp_group) + elif isinstance(self._module.model, amp.MixPrecisionLayer) \ + and self._distributed and dist.get_world_size() == self._dp_degree: + loss = self._model_forward_backward(batch) + fused_allreduce_gradients( + list(self._module.model.parameters()), None) else: loss = self._model_forward_backward(batch) else: diff --git a/ppfleetx/distributed/apis/amp.py b/ppfleetx/distributed/apis/amp.py index d8b6e6ecc..5eecca75d 100644 --- a/ppfleetx/distributed/apis/amp.py +++ b/ppfleetx/distributed/apis/amp.py @@ -60,7 +60,7 @@ def param_hook(tmp_grad): param.main_grad.add_(tmp_grad.cast(paddle.float32)) # NOTE: It doesn't work. - param.clear_gradient(False) + # param.clear_gradient(False) return None return param_hook @@ -126,7 +126,6 @@ def step(self): for param in self._parameter_list: if param.stop_gradient: continue - assert param._grad_ivar() is None grad_var = param.main_grad if framework.in_dygraph_mode(): if (hasattr(grad_var, "is_selected_rows") and @@ -153,7 +152,6 @@ def step(self): for param in param_group['params']: if param.stop_gradient: continue - assert param._grad_ivar() is None grad_var = param.main_grad if framework.in_dygraph_mode(): if (hasattr(grad_var, "is_selected_rows") and @@ -192,14 +190,13 @@ def clear_grad(self, set_to_zero=True): param_list.append(p) for p in param_list: - if hasattr(p, "main_grad"): - assert p.main_grad is not None + if hasattr(p, "main_grad") and p.main_grad is not None: if set_to_zero: p.main_grad.zero_() else: p.main_grad._clear() p.main_grad = None - else: + elif not hasattr(p, "main_grad"): p.clear_gradient(set_to_zero) def __getattr__(self, item): @@ -214,13 +211,11 @@ def unscale_method(self, optimizer): optimizer._param_groups[0], dict): for group in optimizer._param_groups: for param in group['params']: - assert param._grad_ivar() is None if param.main_grad is not None: assert param.main_grad.dtype == core.VarDesc.VarType.FP32 param_grads.append(param.main_grad) else: for param in optimizer._parameter_list: - assert param._grad_ivar() is None if param.main_grad is not None: assert param.main_grad.dtype == core.VarDesc.VarType.FP32 param_grads.append(param.main_grad) diff --git a/ppfleetx/optims/lr_scheduler.py b/ppfleetx/optims/lr_scheduler.py index aa2b84334..da6b3f26f 100644 --- a/ppfleetx/optims/lr_scheduler.py +++ b/ppfleetx/optims/lr_scheduler.py @@ -58,6 +58,21 @@ def get_lr(self): coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) return self.min_lr + coeff * (self.max_lr - self.min_lr) + # def step(self, epoch=None): + # if epoch is None: + # self.last_epoch += 0 + # self.last_lr = self.get_lr() + # else: + # self.last_epoch += epoch + # if hasattr(self, "_get_closed_form_lr"): + # self.last_lr = self._get_closed_form_lr() + # else: + # self.last_lr = self.get_lr() + + # if self.verbose: + # print('Epoch {}: {} set learning rate to {}.'.format( + # self.last_epoch, self.__class__.__name__, self.last_lr)) + class LinearDecayWithWarmup(LRScheduler): def __init__(self, From 75b9a9f353ab39698c346aff94052acb6a4985e4 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Sun, 5 Mar 2023 02:53:42 +0000 Subject: [PATCH 11/25] update for exps part1 --- ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml | 4 ++-- 
.../models/language_model/gpt/dygraph/hybrid_model.py | 11 +++++++++-- .../models/language_model/gpt/dygraph/single_model.py | 11 +++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml index 0fbcb6a2d..b2f34e1c3 100644 --- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml +++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml @@ -44,7 +44,7 @@ Data: dataset: name: GPTDataset input_dir: ./data/ - split: [949, 50, 1] + split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler @@ -59,7 +59,7 @@ Data: dataset: name: GPTDataset input_dir: ./data/ - split: [949, 50, 1] + split: [969, 30, 1] max_seq_len: 1024 sampler: name: GPTBatchSampler diff --git a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index 5ff6c15bd..8f1a242be 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -16,6 +16,8 @@ import collections import logging +from distutils.util import strtobool +import os import paddle import paddle.nn as nn @@ -838,8 +840,12 @@ def forward(self, input_ids=input_ids, position_ids=position_ids) # fused_soiftmax_with_triangular is only suppported on GPU/DCU. + fused_softmax_with_triangular = strtobool( + os.getenv("fused_softmax_with_triangular", True)) + # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. - if self.training == False or not paddle.is_compiled_with_cuda(): + if not fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( + ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( @@ -858,7 +864,8 @@ def forward(self, encoder_outputs = self.decoder( embedding_output, memory=None, - tgt_mask=None if self.training and paddle.is_compiled_with_cuda() + tgt_mask=None if (fused_softmax_with_triangular and + self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) diff --git a/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 067b61454..12d58e88b 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -16,6 +16,8 @@ import collections import logging +from distutils.util import strtobool +import os import paddle import paddle.nn as nn @@ -725,8 +727,12 @@ def forward(self, input_ids=input_ids, position_ids=position_ids) # fused_soiftmax_with_triangular is only suppported on GPU/DCU. + fused_softmax_with_triangular = strtobool( + os.getenv("fused_softmax_with_triangular", True)) + # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. 
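# A minimal, standalone sketch (the helper name `env_flag` is assumed, not part
# of this patch) of reading the boolean env toggle used above. Note that
# distutils.util.strtobool expects a string, so a string default such as "True"
# keeps the unset-variable path well-defined.
import os
from distutils.util import strtobool

def env_flag(name, default="True"):
    # Returns True/False from values like "1", "true", "False", "0".
    return bool(strtobool(os.getenv(name, default)))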
- if self.training == False or not paddle.is_compiled_with_cuda(): + if not fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( + ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( paddle.ones( @@ -745,7 +751,8 @@ def forward(self, encoder_outputs = self.decoder( embedding_output, memory=None, - tgt_mask=None if self.training and paddle.is_compiled_with_cuda() + tgt_mask=None if (fused_softmax_with_triangular and + self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, cache=cache) From 58a337f9ff3be2fd852d291e43743fd6b78add27 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Sun, 5 Mar 2023 03:23:07 +0000 Subject: [PATCH 12/25] update for exps part2 --- ppfleetx/core/engine/eager_engine.py | 16 +- .../gpt/dygraph/single_model.py | 242 ++++++++++++++++-- ppfleetx/optims/lr_scheduler.py | 28 +- 3 files changed, 248 insertions(+), 38 deletions(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index 5687d788c..af0443ba5 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -23,7 +23,7 @@ import paddle.distributed.fleet as fleet from paddle.optimizer.lr import LRScheduler -from paddle.distributed.parallel import sync_params_buffers +from paddle.fluid.dygraph.parallel import sync_params_buffers from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from paddle.profiler import SummaryView from paddle.distributed.fleet.meta_parallel import TensorParallel @@ -116,6 +116,9 @@ def configure_optimizers(self): # "'loss_fn' must be sub classes of `paddle.nn.Layer` or any callable function, but got: {module.loss_fn.__class__.__name__}." # ) + # global configs + self._global_batch_size = configs['Global']['global_batch_size'] + # engine configs self._configs = configs['Engine'] @@ -366,7 +369,9 @@ def _train_one_epoch(self, train_losses = [] if self._lr_scheduler is not None and self._lr_scheduler_mode == 'step': - self._lr_scheduler.step() + # TODO: if update_successful + if self._scaler is None or self._scaler._found_inf == 0: + self._lr_scheduler.step(epoch=self._global_batch_size) self._optimizer.clear_grad() @@ -529,7 +534,10 @@ def _model_forward_backward(self, batch): level=self._amp_level): loss = self._module.training_step(micro_batch) - loss_bw = self._scaler.scale(loss) if self._amp_enable else loss + if self._amp_enable and self._amp_dtype == "float16": + loss_bw = self._scaler.scale(loss) + else: + loss_bw = loss if self._accumulate_steps > 1: # div the loss for backward loss_bw = loss_bw / self._accumulate_steps @@ -557,7 +565,7 @@ def _optim_update_params(self): p.bw_storage.scale_(1.0 / self._dp_group.nranks) dist.all_reduce(p.bw_storage, group=self._dp_group) - if self._amp_enable: + if self._amp_enable and self._amp_dtype == "float16": self._scaler.step(self._optimizer) self._scaler.update() else: diff --git a/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 12d58e88b..aaeee67a4 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -18,6 +18,7 @@ import logging from distutils.util import strtobool import os +import numpy as np import paddle import paddle.nn as nn @@ -37,6 +38,9 @@ from ppfleetx.models.language_model.moe import MoELayer from ppfleetx.models.language_model.moe_exp.layer import MoE +idx = 0 
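+# Debug-only switches for the precision-check experiments in this file:
+# `idx` (above) counts forward passes, and `save_intermediate` (below) gates
+# dumping intermediate activations/gradients to ./check_precision via np.save.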
+save_intermediate = False + from ppfleetx.utils.log import logger try: from paddle.nn.functional.flash_attention import flash_attention @@ -142,9 +146,45 @@ def __init__(self, embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): + # NOTE : save and check attention qkv + + #print("attention qkv debug: ", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [ + # query, + # self.qkv_proj.weight, + # self.qkv_proj.bias, + #]]) + if save_intermediate: + np.save("check_precision/002query%04d" % idx, + paddle.cast(query, 'float32').numpy()) + query.register_hook(lambda grad: np.save("check_precision/grad002query%04d"%idx, paddle.cast(grad, 'float32').numpy())) + mix_layer = self.qkv_proj(query) + + if save_intermediate: + np.save("check_precision/003mix_layer%04d" % idx, + paddle.cast(mix_layer, 'float32').numpy()) + mix_layer.register_hook(lambda grad: np.save("check_precision/grad003mix_layer%04d"%idx, paddle.cast(grad, 'float32').numpy())) + + # NOTE : save and check mix_layer + #print("attention mix_layer", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [mix_layer, ]]) mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim]) + mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) + if save_intermediate: + np.save("check_precision/004mix_layer_trans%04d" % idx, + paddle.cast(mix_layer, 'float32').numpy()) + mix_layer.register_hook(lambda grad: np.save("check_precision/grad004mix_layer_trans%04d"%idx, paddle.cast(grad, 'float32').numpy())) + # NOTE : save and check mix_layer after reshape and transpose q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) + # NOTE : save and check q, k, v + #np.save("check_precision/q_split%04d"%idx, q.numpy()) + #np.save("check_precision/k_split%04d"%idx, k.numpy()) + #np.save("check_precision/v_split%04d"%idx, v.numpy()) assert not isinstance( cache, self.StaticCache @@ -167,13 +207,17 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) + # NOTE : save and check q q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim]) + q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + # NOTE : save and check q after reshape and transpose if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) + # NOTE : save and check k/v if isinstance(cache, self.Cache): # for decoder self-attention in inference @@ -197,9 +241,15 @@ def compute_kv(self, key, value): """ k = self.k_proj(key) + # NOTE : save and check k v = self.v_proj(value) + # NOTE : save and check v k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim]) + k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + # NOTE : save and check k after reshape and transpose v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim]) + v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + # NOTE : save and check v after reshape and transpose return k, v def gen_cache(self, key, value=None, type=Cache): @@ -249,12 +299,17 @@ def core_attn(self, q, k, v, attn_mask=None): product = paddle.matmul( x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) + # NOTE : save and check product after matmul + if self.scale_qk_coeff != 1.0: product = product.scale(self.scale_qk_coeff) + # NOTE : save and check product after scale if attn_mask is not None: product = product + attn_mask + # NOTE : save and check product after adding attn_mask weights = 
F.softmax(product) + # NOTE : save and check weights after softmax else: weights = incubate.softmax_mask_fuse_upper_triangle(product) @@ -266,10 +321,12 @@ def core_attn(self, q, k, v, attn_mask=None): mode="upscale_in_train") out = paddle.matmul(weights, v) + # NOTE : save and check out after matmul # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, -1]) + # NOTE : save and check out after transpose/reshape return out, weights @@ -287,12 +344,30 @@ def forward(self, key = query if key is None else key value = query if value is None else value # compute q ,k ,v - if self.fuse_attn_qkv: - q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) + if use_cache is False: + if self.fuse_attn_qkv: + q, k, v = self._fuse_prepare_qkv(query, use_cache, cache) + # NOTE : save and check q/k/v + else: + q, k, v = self._prepare_qkv(query, key, value, use_cache, + cache) + # NOTE : save and check q/k/v else: - q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, - cache) - + if self.fuse_attn_qkv: + q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, + cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, + use_cache, cache) + + if save_intermediate: + np.save("check_precision/005attention_q_layer%04d" % idx, + paddle.cast(q, 'float32').numpy()) + q.register_hook(lambda grad: np.save("check_precision/grad005attention_q_layer%04d"%idx, paddle.cast(grad, 'float32').numpy())) + #print("attention q_layer", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [q, ]]) if self.use_recompute and self.recompute_granularity == "core_attn" and self.do_recompute: out, weights = recompute(self.core_attn, q, k, v, attn_mask) elif self.use_flash_attn and attn_mask is None: @@ -300,8 +375,19 @@ def forward(self, else: out, weights = self.core_attn(q, k, v, attn_mask=attn_mask) + if save_intermediate: + np.save("check_precision/006core_attn_out%04d" % idx, + paddle.cast(out, 'float32').numpy()) + out.register_hook(lambda grad: np.save("check_precision/grad006core_attn_out%04d"%idx, paddle.cast(grad, 'float32').numpy())) + # NOTE : save and check out after core_attn + #print("core_attn out", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [out, ]]) + # project to output out = self.out_proj(out) + # NOTE : save and check out after out_proj outs = [out] if self.need_weights: @@ -379,8 +465,9 @@ def forward(self, cache=cache[i]) new_caches.append(new_cache) - if self.norm is not None: - output = self.norm(output) + #if self.norm is not None: + # output = self.norm(output) + # NOTE : save and check output after norm return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): @@ -510,8 +597,29 @@ def __init__(self, def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) + #self.linear1.weight.register_hook(lambda grad: np.save("check_precision/linear1_weight_grad%04d"%idx, grad.numpy())) + #self.linear2.weight.register_hook(lambda grad: np.save("check_precision/linear2_weight_grad%04d"%idx, grad.numpy())) + if save_intermediate: + np.save("check_precision/000hidden_states_start%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad000hidden_states_start%04d"%idx, paddle.cast(grad, 'float32').numpy())) + #print("hidden_states start", [{ + # "sum": x.abs().sum().item(), + # 
"mean": x.abs().mean().item() + #} for x in [tgt, ]]) + # if self.normalize_before: + # tgt = self.norm1(tgt) + # NOTE : save and check tgt after norm1 + + # print(self.norm1, self.norm1.weight, self.norm1.bias) + if save_intermediate: + np.save("check_precision/001hidden_norm_before%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad001hidden_norm_before%04d"%idx, paddle.cast(grad, 'float32').numpy())) + #print("hidden norm before", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [tgt, ]]) if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn" and self.do_recompute: @@ -522,27 +630,84 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) - tgt = residual + self.dropout1(tgt) - if not self.normalize_before: - tgt = self.norm1(tgt) - + # NOTE : save and check attention_output + if save_intermediate: + np.save("check_precision/007attention_output%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad007atteition_output%04d"%idx, paddle.cast(grad, 'float32').numpy())) + #print("attention_output", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [tgt, ]]) + + tgt_dropout = self.dropout1(tgt) + if save_intermediate: + np.save("check_precision/008dropout1%04d" % idx, + paddle.cast(tgt_dropout, 'float32').numpy()) + tgt_dropout.register_hook(lambda grad: np.save("check_precision/grad008dropout1%04d"%idx, paddle.cast(grad, 'float32').numpy())) + tgt = residual + tgt_dropout + + #tgt = residual + self.dropout1(tgt) + # NOTE : save and check tgt after add_dropout + + #if not self.normalize_before: + # tgt = self.norm1(tgt) + + if save_intermediate: + np.save("check_precision/0081tgt_norm2%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad:np.save("check_precision/grad0081tgt_norm2%04d"%idx, paddle.cast(grad, 'float32').numpy())) residual = tgt - if self.normalize_before: - tgt = self.norm2(tgt) + #if self.normalize_before: + # tgt = self.norm2(tgt) + # np.save("check_precision/009tgt_norm2%04d"%idx, paddle.cast(tgt, 'float32').numpy()) + # tgt.register_hook(lambda grad: np.save("check_precision/grad009tgt_norm2%04d"%idx, paddle.cast(grad, 'float32').numpy())) + + # NOTE : save and check tgt after norm2 # if self.expert_mode: # tgt = self.moe_mlp(tgt) if self.num_experts > 1: tgt = self.moe_mlp(tgt) else: - tgt = self.dropout2( - self.linear2(self.activation(self.linear1(tgt)))) + #tgt = self.dropout2( + # self.linear2(self.activation(self.linear1(tgt)))) + tgt = self.linear1(tgt) + if save_intermediate: + np.save("check_precision/009zzlinear1%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad009zzlinear1%04d"%idx, paddle.cast(grad, 'float32').numpy())) + tgt = self.activation(tgt) + if save_intermediate: + np.save("check_precision/009zzzactivation%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad009zzzactivation%04d"%idx, paddle.cast(grad, 'float32').numpy())) + tgt = self.linear2(tgt) + if save_intermediate: + np.save("check_precision/0091mlp_output%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad0091mlp_output%04d"%idx, paddle.cast(grad, 
'float32').numpy())) + tgt = self.dropout2(tgt) + if save_intermediate: + np.save("check_precision/0092dropout2%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad0092dropout2%04d"%idx, paddle.cast(grad, 'float32').numpy())) + # NOTE : divide the operations and save/check separately + # NOTE : out = self.linear1(tgt) + # NOTE : out = self.activation(out) + # NOTE : out = self.linear2(out) + # NOTE : tgt = self.dropout2(out) tgt = residual + tgt + # NOTE : save and check tgt after add_op - if not self.normalize_before: - tgt = self.norm2(tgt) + #if not self.normalize_before: + # tgt = self.norm2(tgt) + if save_intermediate: + np.save("check_precision/010output%04d" % idx, + paddle.cast(tgt, 'float32').numpy()) + tgt.register_hook(lambda grad: np.save("check_precision/grad010output%04d"%idx, paddle.cast(grad, 'float32').numpy())) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): @@ -589,10 +754,15 @@ def forward(self, input_ids, position_ids=None): seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones + # NOTE : save and check input_ids input_embedings = self.word_embeddings(input_ids) + # NOTE : save and check input_embedings position_embeddings = self.position_embeddings(position_ids) + # NOTE : save and check position_embeddings embeddings = input_embedings + position_embeddings + # NOTE : save and check embeddings embeddings = self.dropout(embeddings) + # no dropout return embeddings @@ -712,6 +882,9 @@ def forward(self, use_cache=False, cache=None): + global idx + idx += 1 + if position_ids is None: past_length = 0 if cache is not None: @@ -723,10 +896,24 @@ def forward(self, position_ids = position_ids.unsqueeze(0) # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) + + # NOTE : save and check input_ids/positon ids + #np.save("check_precision/input_ids", input_ids.numpy()) + #print("input_ids", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [input_ids, ]]) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) - # fused_soiftmax_with_triangular is only suppported on GPU/DCU. + #np.save("check_precision/embedding_output",embedding_output.numpy()) + #print("*****%d*****"%idx) + #np.save("check_precision/embedding_output%d"%idx,embedding_output.numpy()) + #print("embedding_output", [{ + # "sum": x.abs().sum().item(), + # "mean": x.abs().mean().item() + #} for x in [embedding_output, ]]) + fused_softmax_with_triangular = strtobool( os.getenv("fused_softmax_with_triangular", True)) # fused_softmax_with_triangular is only suppported on GPU/DCU. @@ -748,6 +935,8 @@ def forward(self, # The tensor returned by triu not in static graph. 
attention_mask.stop_gradient = True + # NOTE : save and check attention_mask + encoder_outputs = self.decoder( embedding_output, memory=None, @@ -757,6 +946,8 @@ def forward(self, use_cache=use_cache, cache=cache) + # NOTE : save and check encoder_outputs + return encoder_outputs @@ -795,7 +986,13 @@ def forward(self, encoder_outputs, get_attr(self.gpt.embeddings.word_embeddings, "weight"), transpose_y=True) + # NOTE : save and check logits after matmul_op + #global idx + if save_intermediate: + np.save("check_precision/011gpt_output%04d" % idx, + paddle.cast(logits, 'float32').numpy()) + logits.register_hook(lambda grad: np.save("check_precision/grad011gpt_output%04d"%idx, paddle.cast(grad, 'float32').numpy())) if use_cache: return logits, cached_kvs else: @@ -830,12 +1027,17 @@ def forward(self, prediction_scores, masked_lm_labels, loss_mask): Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ + #global idx + #idx += 1 + if idx == 20000: + exit(0) masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() + breakpoint() return loss diff --git a/ppfleetx/optims/lr_scheduler.py b/ppfleetx/optims/lr_scheduler.py index da6b3f26f..17d569e28 100644 --- a/ppfleetx/optims/lr_scheduler.py +++ b/ppfleetx/optims/lr_scheduler.py @@ -58,20 +58,20 @@ def get_lr(self): coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) return self.min_lr + coeff * (self.max_lr - self.min_lr) - # def step(self, epoch=None): - # if epoch is None: - # self.last_epoch += 0 - # self.last_lr = self.get_lr() - # else: - # self.last_epoch += epoch - # if hasattr(self, "_get_closed_form_lr"): - # self.last_lr = self._get_closed_form_lr() - # else: - # self.last_lr = self.get_lr() - - # if self.verbose: - # print('Epoch {}: {} set learning rate to {}.'.format( - # self.last_epoch, self.__class__.__name__, self.last_lr)) + def step(self, epoch=None): + if epoch is None: + self.last_epoch += 0 + self.last_lr = self.get_lr() + else: + self.last_epoch += epoch + if hasattr(self, "_get_closed_form_lr"): + self.last_lr = self._get_closed_form_lr() + else: + self.last_lr = self.get_lr() + + if self.verbose: + print('Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr)) class LinearDecayWithWarmup(LRScheduler): From d23907730d38fd4f76ca520ff9445186dc445635 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Mon, 6 Mar 2023 01:47:01 +0000 Subject: [PATCH 13/25] update for exps part3 --- .../gpt/dygraph/single_model.py | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/ppfleetx/models/language_model/gpt/dygraph/single_model.py index aaeee67a4..5e2b27761 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -174,7 +174,7 @@ def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): # "mean": x.abs().mean().item() #} for x in [mix_layer, ]]) mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim]) - mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) + # mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) if save_intermediate: np.save("check_precision/004mix_layer_trans%04d" % idx, paddle.cast(mix_layer, 'float32').numpy()) @@ -209,7 +209,7 @@ def _prepare_qkv(self, query, key, 
value, use_cache=False, cache=None): q = self.q_proj(query) # NOTE : save and check q q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim]) - q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + # q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) # NOTE : save and check q after reshape and transpose if isinstance(cache, self.StaticCache): @@ -245,10 +245,10 @@ def compute_kv(self, key, value): v = self.v_proj(value) # NOTE : save and check v k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim]) - k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + # k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) # NOTE : save and check k after reshape and transpose v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim]) - v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + # v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) # NOTE : save and check v after reshape and transpose return k, v @@ -346,11 +346,12 @@ def forward(self, # compute q ,k ,v if use_cache is False: if self.fuse_attn_qkv: - q, k, v = self._fuse_prepare_qkv(query, use_cache, cache) + q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, + cache) # NOTE : save and check q/k/v else: - q, k, v = self._prepare_qkv(query, key, value, use_cache, - cache) + q, k, v, cache = self._prepare_qkv(query, key, value, + use_cache, cache) # NOTE : save and check q/k/v else: if self.fuse_attn_qkv: @@ -465,7 +466,7 @@ def forward(self, cache=cache[i]) new_caches.append(new_cache) - #if self.norm is not None: + # if self.norm is not None: # output = self.norm(output) # NOTE : save and check output after norm return output if use_cache is False else (output, new_caches) @@ -607,10 +608,11 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): # "sum": x.abs().sum().item(), # "mean": x.abs().mean().item() #} for x in [tgt, ]]) + # if self.normalize_before: # tgt = self.norm1(tgt) - # NOTE : save and check tgt after norm1 + # NOTE : save and check tgt after norm1 # print(self.norm1, self.norm1.weight, self.norm1.bias) if save_intermediate: np.save("check_precision/001hidden_norm_before%04d" % idx, @@ -650,7 +652,7 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): #tgt = residual + self.dropout1(tgt) # NOTE : save and check tgt after add_dropout - #if not self.normalize_before: + # if not self.normalize_before: # tgt = self.norm1(tgt) if save_intermediate: @@ -658,7 +660,7 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): paddle.cast(tgt, 'float32').numpy()) tgt.register_hook(lambda grad:np.save("check_precision/grad0081tgt_norm2%04d"%idx, paddle.cast(grad, 'float32').numpy())) residual = tgt - #if self.normalize_before: + # if self.normalize_before: # tgt = self.norm2(tgt) # np.save("check_precision/009tgt_norm2%04d"%idx, paddle.cast(tgt, 'float32').numpy()) # tgt.register_hook(lambda grad: np.save("check_precision/grad009tgt_norm2%04d"%idx, paddle.cast(grad, 'float32').numpy())) @@ -701,7 +703,7 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): tgt = residual + tgt # NOTE : save and check tgt after add_op - #if not self.normalize_before: + # if not self.normalize_before: # tgt = self.norm2(tgt) if save_intermediate: @@ -1037,7 +1039,6 @@ def forward(self, prediction_scores, masked_lm_labels, loss_mask): loss_mask = loss_mask.reshape([-1]) masked_lm_loss = paddle.sum(masked_lm_loss.reshape([-1]) * loss_mask) loss = masked_lm_loss / loss_mask.sum() - breakpoint() return loss From b8606bb772e8dd00f051a92f7b1a6958c7df349c Mon Sep 17 00:00:00 2001 From: haohongxiang Date: 
Mon, 6 Mar 2023 03:17:51 +0000 Subject: [PATCH 14/25] update for exps part4 --- ppfleetx/core/engine/eager_engine.py | 2 +- .../gpt/dygraph/single_model.py | 219 +----------------- 2 files changed, 9 insertions(+), 212 deletions(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index af0443ba5..67a2a9104 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -23,7 +23,7 @@ import paddle.distributed.fleet as fleet from paddle.optimizer.lr import LRScheduler -from paddle.fluid.dygraph.parallel import sync_params_buffers +from paddle.distributed.parallel import sync_params_buffers from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients from paddle.profiler import SummaryView from paddle.distributed.fleet.meta_parallel import TensorParallel diff --git a/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 5e2b27761..3cdc170d6 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -38,9 +38,6 @@ from ppfleetx.models.language_model.moe import MoELayer from ppfleetx.models.language_model.moe_exp.layer import MoE -idx = 0 -save_intermediate = False - from ppfleetx.utils.log import logger try: from paddle.nn.functional.flash_attention import flash_attention @@ -146,45 +143,9 @@ def __init__(self, embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): - # NOTE : save and check attention qkv - - #print("attention qkv debug: ", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [ - # query, - # self.qkv_proj.weight, - # self.qkv_proj.bias, - #]]) - if save_intermediate: - np.save("check_precision/002query%04d" % idx, - paddle.cast(query, 'float32').numpy()) - query.register_hook(lambda grad: np.save("check_precision/grad002query%04d"%idx, paddle.cast(grad, 'float32').numpy())) - mix_layer = self.qkv_proj(query) - - if save_intermediate: - np.save("check_precision/003mix_layer%04d" % idx, - paddle.cast(mix_layer, 'float32').numpy()) - mix_layer.register_hook(lambda grad: np.save("check_precision/grad003mix_layer%04d"%idx, paddle.cast(grad, 'float32').numpy())) - - # NOTE : save and check mix_layer - #print("attention mix_layer", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [mix_layer, ]]) mix_layer = paddle.reshape_(mix_layer, [0, 0, -1, 3 * self.head_dim]) - # mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3]) - if save_intermediate: - np.save("check_precision/004mix_layer_trans%04d" % idx, - paddle.cast(mix_layer, 'float32').numpy()) - mix_layer.register_hook(lambda grad: np.save("check_precision/grad004mix_layer_trans%04d"%idx, paddle.cast(grad, 'float32').numpy())) - # NOTE : save and check mix_layer after reshape and transpose q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1) - # NOTE : save and check q, k, v - #np.save("check_precision/q_split%04d"%idx, q.numpy()) - #np.save("check_precision/k_split%04d"%idx, k.numpy()) - #np.save("check_precision/v_split%04d"%idx, v.numpy()) assert not isinstance( cache, self.StaticCache @@ -207,17 +168,13 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) - # NOTE : save and check q q = tensor.reshape(x=q, shape=[0, 0, -1, self.head_dim]) - # q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) - # 
NOTE : save and check q after reshape and transpose if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: k, v = self.compute_kv(key, value) - # NOTE : save and check k/v if isinstance(cache, self.Cache): # for decoder self-attention in inference @@ -241,15 +198,9 @@ def compute_kv(self, key, value): """ k = self.k_proj(key) - # NOTE : save and check k v = self.v_proj(value) - # NOTE : save and check v k = tensor.reshape(x=k, shape=[0, 0, -1, self.head_dim]) - # k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) - # NOTE : save and check k after reshape and transpose v = tensor.reshape(x=v, shape=[0, 0, -1, self.head_dim]) - # v = tensor.transpose(x=v, perm=[0, 2, 1, 3]) - # NOTE : save and check v after reshape and transpose return k, v def gen_cache(self, key, value=None, type=Cache): @@ -299,17 +250,12 @@ def core_attn(self, q, k, v, attn_mask=None): product = paddle.matmul( x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) - # NOTE : save and check product after matmul - if self.scale_qk_coeff != 1.0: product = product.scale(self.scale_qk_coeff) - # NOTE : save and check product after scale if attn_mask is not None: product = product + attn_mask - # NOTE : save and check product after adding attn_mask weights = F.softmax(product) - # NOTE : save and check weights after softmax else: weights = incubate.softmax_mask_fuse_upper_triangle(product) @@ -321,12 +267,10 @@ def core_attn(self, q, k, v, attn_mask=None): mode="upscale_in_train") out = paddle.matmul(weights, v) - # NOTE : save and check out after matmul # combine heads out = tensor.transpose(out, perm=[0, 2, 1, 3]) out = tensor.reshape(x=out, shape=[0, 0, -1]) - # NOTE : save and check out after transpose/reshape return out, weights @@ -344,31 +288,12 @@ def forward(self, key = query if key is None else key value = query if value is None else value # compute q ,k ,v - if use_cache is False: - if self.fuse_attn_qkv: - q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, - cache) - # NOTE : save and check q/k/v - else: - q, k, v, cache = self._prepare_qkv(query, key, value, - use_cache, cache) - # NOTE : save and check q/k/v + if self.fuse_attn_qkv: + q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) else: - if self.fuse_attn_qkv: - q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, - cache) - else: - q, k, v, cache = self._prepare_qkv(query, key, value, - use_cache, cache) - - if save_intermediate: - np.save("check_precision/005attention_q_layer%04d" % idx, - paddle.cast(q, 'float32').numpy()) - q.register_hook(lambda grad: np.save("check_precision/grad005attention_q_layer%04d"%idx, paddle.cast(grad, 'float32').numpy())) - #print("attention q_layer", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [q, ]]) + q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, + cache) + if self.use_recompute and self.recompute_granularity == "core_attn" and self.do_recompute: out, weights = recompute(self.core_attn, q, k, v, attn_mask) elif self.use_flash_attn and attn_mask is None: @@ -376,19 +301,8 @@ def forward(self, else: out, weights = self.core_attn(q, k, v, attn_mask=attn_mask) - if save_intermediate: - np.save("check_precision/006core_attn_out%04d" % idx, - paddle.cast(out, 'float32').numpy()) - out.register_hook(lambda grad: np.save("check_precision/grad006core_attn_out%04d"%idx, paddle.cast(grad, 'float32').numpy())) - # NOTE : save and check out after core_attn - #print("core_attn out", [{ 
- # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [out, ]]) - # project to output out = self.out_proj(out) - # NOTE : save and check out after out_proj outs = [out] if self.need_weights: @@ -468,7 +382,6 @@ def forward(self, # if self.norm is not None: # output = self.norm(output) - # NOTE : save and check output after norm return output if use_cache is False else (output, new_caches) def gen_cache(self, memory, do_zip=False): @@ -598,31 +511,9 @@ def __init__(self, def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): residual = tgt - #self.linear1.weight.register_hook(lambda grad: np.save("check_precision/linear1_weight_grad%04d"%idx, grad.numpy())) - #self.linear2.weight.register_hook(lambda grad: np.save("check_precision/linear2_weight_grad%04d"%idx, grad.numpy())) - if save_intermediate: - np.save("check_precision/000hidden_states_start%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad000hidden_states_start%04d"%idx, paddle.cast(grad, 'float32').numpy())) - #print("hidden_states start", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [tgt, ]]) - # if self.normalize_before: # tgt = self.norm1(tgt) - # NOTE : save and check tgt after norm1 - # print(self.norm1, self.norm1.weight, self.norm1.bias) - if save_intermediate: - np.save("check_precision/001hidden_norm_before%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad001hidden_norm_before%04d"%idx, paddle.cast(grad, 'float32').numpy())) - #print("hidden norm before", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [tgt, ]]) - if use_cache is False: if self.use_recompute and self.recompute_granularity == "full_attn" and self.do_recompute: tgt = recompute(self.self_attn, tgt, None, None, tgt_mask, @@ -632,84 +523,26 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): else: tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache) - # NOTE : save and check attention_output - if save_intermediate: - np.save("check_precision/007attention_output%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad007atteition_output%04d"%idx, paddle.cast(grad, 'float32').numpy())) - #print("attention_output", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [tgt, ]]) - - tgt_dropout = self.dropout1(tgt) - if save_intermediate: - np.save("check_precision/008dropout1%04d" % idx, - paddle.cast(tgt_dropout, 'float32').numpy()) - tgt_dropout.register_hook(lambda grad: np.save("check_precision/grad008dropout1%04d"%idx, paddle.cast(grad, 'float32').numpy())) - tgt = residual + tgt_dropout - - #tgt = residual + self.dropout1(tgt) - # NOTE : save and check tgt after add_dropout - + tgt = residual + self.dropout1(tgt) # if not self.normalize_before: # tgt = self.norm1(tgt) - - if save_intermediate: - np.save("check_precision/0081tgt_norm2%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad:np.save("check_precision/grad0081tgt_norm2%04d"%idx, paddle.cast(grad, 'float32').numpy())) residual = tgt # if self.normalize_before: # tgt = self.norm2(tgt) - # np.save("check_precision/009tgt_norm2%04d"%idx, paddle.cast(tgt, 'float32').numpy()) - # tgt.register_hook(lambda grad: np.save("check_precision/grad009tgt_norm2%04d"%idx, paddle.cast(grad, 
'float32').numpy())) - - # NOTE : save and check tgt after norm2 # if self.expert_mode: # tgt = self.moe_mlp(tgt) if self.num_experts > 1: tgt = self.moe_mlp(tgt) else: - #tgt = self.dropout2( - # self.linear2(self.activation(self.linear1(tgt)))) - tgt = self.linear1(tgt) - if save_intermediate: - np.save("check_precision/009zzlinear1%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad009zzlinear1%04d"%idx, paddle.cast(grad, 'float32').numpy())) - tgt = self.activation(tgt) - if save_intermediate: - np.save("check_precision/009zzzactivation%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad009zzzactivation%04d"%idx, paddle.cast(grad, 'float32').numpy())) - tgt = self.linear2(tgt) - if save_intermediate: - np.save("check_precision/0091mlp_output%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad0091mlp_output%04d"%idx, paddle.cast(grad, 'float32').numpy())) - tgt = self.dropout2(tgt) - if save_intermediate: - np.save("check_precision/0092dropout2%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad0092dropout2%04d"%idx, paddle.cast(grad, 'float32').numpy())) - # NOTE : divide the operations and save/check separately - # NOTE : out = self.linear1(tgt) - # NOTE : out = self.activation(out) - # NOTE : out = self.linear2(out) - # NOTE : tgt = self.dropout2(out) + tgt = self.dropout2( + self.linear2(self.activation(self.linear1(tgt)))) tgt = residual + tgt - # NOTE : save and check tgt after add_op # if not self.normalize_before: # tgt = self.norm2(tgt) - if save_intermediate: - np.save("check_precision/010output%04d" % idx, - paddle.cast(tgt, 'float32').numpy()) - tgt.register_hook(lambda grad: np.save("check_precision/grad010output%04d"%idx, paddle.cast(grad, 'float32').numpy())) return tgt if use_cache is False else (tgt, incremental_cache) def gen_cache(self, memory): @@ -756,15 +589,10 @@ def forward(self, input_ids, position_ids=None): seq_length = paddle.cumsum(ones, axis=-1) position_ids = seq_length - ones - # NOTE : save and check input_ids input_embedings = self.word_embeddings(input_ids) - # NOTE : save and check input_embedings position_embeddings = self.position_embeddings(position_ids) - # NOTE : save and check position_embeddings embeddings = input_embedings + position_embeddings - # NOTE : save and check embeddings embeddings = self.dropout(embeddings) - # no dropout return embeddings @@ -884,9 +712,6 @@ def forward(self, use_cache=False, cache=None): - global idx - idx += 1 - if position_ids is None: past_length = 0 if cache is not None: @@ -899,23 +724,9 @@ def forward(self, # .expand_as(input_ids) position_ids = paddle.expand_as(position_ids, input_ids) - # NOTE : save and check input_ids/positon ids - #np.save("check_precision/input_ids", input_ids.numpy()) - #print("input_ids", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [input_ids, ]]) embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) - #np.save("check_precision/embedding_output",embedding_output.numpy()) - #print("*****%d*****"%idx) - #np.save("check_precision/embedding_output%d"%idx,embedding_output.numpy()) - #print("embedding_output", [{ - # "sum": x.abs().sum().item(), - # "mean": x.abs().mean().item() - #} for x in [embedding_output, ]]) - fused_softmax_with_triangular = strtobool( 
os.getenv("fused_softmax_with_triangular", True)) # fused_softmax_with_triangular is only suppported on GPU/DCU. @@ -937,8 +748,6 @@ def forward(self, # The tensor returned by triu not in static graph. attention_mask.stop_gradient = True - # NOTE : save and check attention_mask - encoder_outputs = self.decoder( embedding_output, memory=None, @@ -948,8 +757,6 @@ def forward(self, use_cache=use_cache, cache=cache) - # NOTE : save and check encoder_outputs - return encoder_outputs @@ -988,13 +795,7 @@ def forward(self, encoder_outputs, get_attr(self.gpt.embeddings.word_embeddings, "weight"), transpose_y=True) - # NOTE : save and check logits after matmul_op - #global idx - if save_intermediate: - np.save("check_precision/011gpt_output%04d" % idx, - paddle.cast(logits, 'float32').numpy()) - logits.register_hook(lambda grad: np.save("check_precision/grad011gpt_output%04d"%idx, paddle.cast(grad, 'float32').numpy())) if use_cache: return logits, cached_kvs else: @@ -1029,10 +830,6 @@ def forward(self, prediction_scores, masked_lm_labels, loss_mask): Tensor: The pretraining loss. Its data type should be float32 and its shape is [1]. """ - #global idx - #idx += 1 - if idx == 20000: - exit(0) masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) From bc951b2d4dafb6f74e260a3d4d13355b17fd79e8 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Wed, 8 Mar 2023 06:15:25 +0000 Subject: [PATCH 15/25] update --- .../configs/nlp/gpt/pretrain_gpt_base.yaml | 1 + .../models/language_model/language_module.py | 24 +++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml index b2f34e1c3..d7f5c0ead 100644 --- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml +++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml @@ -32,6 +32,7 @@ Engine: Model: module: "GPTModule" name: "GPT" + vocab_size_divisible_unit: 128 fused_linear: False fuse_attn_qkv: True scale_qk_by_layer_num: True diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index 474134f96..648aaadeb 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
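+# NOTE: 'logging' backs the padded-vocab warning emitted by
+# vocab_size_with_padding() below.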
+import logging import os import sys import copy @@ -127,6 +128,16 @@ def get_model_size(self, l, h, v, s): logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) + def vocab_size_with_padding(self, vocab_size, div_unit, mp_degree): + padded_size = vocab_size + multiple = div_unit * mp_degree + while (padded_size % multiple) != 0: + padded_size += 1 + logging.warning(' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format(vocab_size, padded_size - + vocab_size, padded_size)) + return padded_size + def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) @@ -150,16 +161,21 @@ def get_model(self): model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") + model_name = model_setting.pop("name") + tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] + self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) + + model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting.get('vocab_size', self.tokenizer.vocab_size), + model_setting.pop('vocab_size_divisible_unit'), + self.configs.Distributed.get('mp_degree', 1)) + l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len self.get_model_size(l, h, v, s) - model_name = model_setting.pop("name") - tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] - self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) - if self.nranks == 1: model_setting.pop("sequence_parallel") model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) From 43023a55e337323633e0dca5ebcb8d5cc7dc4ac0 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Wed, 8 Mar 2023 13:25:28 +0000 Subject: [PATCH 16/25] update --- ppfleetx/distributed/apis/env.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/ppfleetx/distributed/apis/env.py b/ppfleetx/distributed/apis/env.py index 13470c4d3..d9855632b 100644 --- a/ppfleetx/distributed/apis/env.py +++ b/ppfleetx/distributed/apis/env.py @@ -42,9 +42,14 @@ def set_seed(seed): else: mp_rank, pp_rank, data_world_rank, data_world_size = 0, 0, 0, 1 - random.seed(seed + data_world_rank) - np.random.seed(seed + data_world_rank) - paddle.seed(seed + data_world_rank) + seed += 100 * pp_rank + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + + # random.seed(seed + data_world_rank) + # np.random.seed(seed + data_world_rank) + # paddle.seed(seed + data_world_rank) # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = seed + 123 + mp_rank * 10 + pp_rank * 1000 + data_world_size From a589a35c5dbdfe1047ed5e79908bef67d37eeb83 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Fri, 10 Mar 2023 03:17:05 +0000 Subject: [PATCH 17/25] update --- ppfleetx/core/engine/eager_engine.py | 18 +++++++++++------- ppfleetx/distributed/apis/amp.py | 19 +++---------------- .../models/language_model/language_module.py | 5 +++-- 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index 67a2a9104..346b01e7c 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -16,6 +16,7 @@ import time import sys import logging +from tokenize import group import paddle import paddle.nn as nn @@ -331,7 +332,8 @@ def _train_one_epoch(self, skip_first = True # Note(GuoxiaWang): Do not use len(train_data_loader()), # 
it will cause a memory leak. - total_train_batch = len(train_data_loader) + total_train_batch = self._max_steps if self._run_mode == 'step' else len( + train_data_loader) total_train_step = self._max_steps if self._run_mode == 'step' else total_train_batch * self._num_train_epochs total_eval_batch = len( valid_data_loader) if valid_data_loader is not None else 0 @@ -347,6 +349,11 @@ def _train_one_epoch(self, loss = self._fit_impl(batch) train_losses.append(loss) + if self._lr_scheduler is not None and self._lr_scheduler_mode == 'step': + # TODO: if update_successful + if self._scaler is None or self._scaler._found_inf == 0: + self._lr_scheduler.step(epoch=self._global_batch_size) + if (step + 1) % self._logging_freq == 0: train_step_cost = get_timestamp() - train_step_start numpy_losses = [float(loss) for loss in train_losses] @@ -359,7 +366,9 @@ def _train_one_epoch(self, 'train_cost': train_step_cost if step == 0 else train_step_cost / self._logging_freq, 'loss': sum(numpy_losses) / len(numpy_losses), - 'lr': self._optimizer.get_lr() + 'lr': self._optimizer.get_lr(), + 'found_inf': self._scaler._found_inf + if self._scaler is not None else 0, } if self._amp_enable: log_dict['loss_scale'] = self._scaler._scale.numpy()[0] @@ -368,11 +377,6 @@ def _train_one_epoch(self, train_step_start = get_timestamp() train_losses = [] - if self._lr_scheduler is not None and self._lr_scheduler_mode == 'step': - # TODO: if update_successful - if self._scaler is None or self._scaler._found_inf == 0: - self._lr_scheduler.step(epoch=self._global_batch_size) - self._optimizer.clear_grad() if self._run_mode == 'step' and not skip_first: diff --git a/ppfleetx/distributed/apis/amp.py b/ppfleetx/distributed/apis/amp.py index 5eecca75d..cd906b964 100644 --- a/ppfleetx/distributed/apis/amp.py +++ b/ppfleetx/distributed/apis/amp.py @@ -40,8 +40,6 @@ def __init__(self, layers, dtype="float16"): if not param.stop_gradient and not hasattr(param, "main_grad"): setattr(param, "main_grad", None) param._register_grad_hook(self._update_main_grad_hook(param)) - # TODO: remove _release_grad_hook after solving the issue in _update_main_grad_hook - param._register_backward_hook(self._release_grad_hook(param)) def _update_main_grad_hook(self, param): """Create the update_main_grad hook for backprop.""" @@ -49,8 +47,8 @@ def _update_main_grad_hook(self, param): # Hook used for back-prop and grad-merge. @paddle.autograd.no_grad() def param_hook(tmp_grad): - # TODO: cancel the comments of the checking code - # assert param.grad is None, "param.grad is not None" + assert param.grad is None, \ + "In main_grad node, param.grad should be None, but find param[{}] has grad.".format(param.name) if param.main_grad is None: param.main_grad = core.eager.Tensor( value=tmp_grad.cast(paddle.float32).value(), @@ -59,22 +57,11 @@ def param_hook(tmp_grad): else: param.main_grad.add_(tmp_grad.cast(paddle.float32)) - # NOTE: It doesn't work. - # param.clear_gradient(False) + tmp_grad._clear_data() return None return param_hook - def _release_grad_hook(self, param): - """Create the release_main_grad hook for backprop.""" - - # Hook used for back-prop and grad-merge. 
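# Condensed, standalone sketch of the float32 "main_grad" pattern used by the
# param_hook above (illustration only: attach_main_grad is a hypothetical helper,
# and the master gradient is kept as a plain cast copy instead of being re-wrapped
# as in the patch). Each backward pass casts the low-precision gradient to float32
# and accumulates it, so grad merging across micro-batches happens in full precision.
import paddle

def attach_main_grad(param):
    param.main_grad = None

    @paddle.autograd.no_grad()
    def hook(tmp_grad):
        fp32_grad = tmp_grad.cast(paddle.float32)
        if param.main_grad is None:
            param.main_grad = fp32_grad      # first micro-batch: keep the cast copy
        else:
            param.main_grad.add_(fp32_grad)  # later micro-batches: accumulate in fp32
        return None                          # the low-precision grad is not kept

    param._register_grad_hook(hook)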
- @paddle.autograd.no_grad() - def release_hook(*_): - param.clear_gradient(False) - - return release_hook - def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index 648aaadeb..8036eb234 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -81,9 +81,10 @@ def training_step_end(self, log_dict): None) is not None else "" logger.info( "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, " \ - "ips_total: %.0f tokens/s, ips: %.0f tokens/s, %s learning rate: %.5e" + "ips_total: %.0f tokens/s, ips: %.0f tokens/s, %s learning rate: %.5e, found_inf: %.0f" % (log_dict['epoch'], log_dict['total_epoch'], log_dict['batch'], log_dict['total_step'], log_dict['loss'], - log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, loss_scale_str, log_dict['lr'])) + log_dict['train_cost'], speed, speed * default_global_tokens_num, speed * default_global_tokens_num / self.data_world_size, \ + loss_scale_str, log_dict['lr'], log_dict['found_inf'])) def validation_step(self, batch): tokens, position_ids, labels, loss_mask = batch From 1ea9406be6007d6f0afca9986d6fdcc12729e6ff Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 14 Mar 2023 12:53:11 +0000 Subject: [PATCH 18/25] update --- ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py | 2 +- ppfleetx/models/language_model/gpt/dygraph/single_model.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index 89f8e5441..8907884bd 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -849,7 +849,7 @@ def forward(self, # fused_soiftmax_with_triangular is only suppported on GPU/DCU. fused_softmax_with_triangular = strtobool( - os.getenv("fused_softmax_with_triangular", True)) + os.getenv("fused_softmax_with_triangular", 'True')) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. if not fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( diff --git a/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 8bae7a8a4..89c26b325 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -728,7 +728,7 @@ def forward(self, input_ids=input_ids, position_ids=position_ids) fused_softmax_with_triangular = strtobool( - os.getenv("fused_softmax_with_triangular", True)) + os.getenv("fused_softmax_with_triangular", 'True')) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. 
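# Why the default changes from True to 'True' in the two hunks above:
# distutils.util.strtobool() lower-cases its argument, so it only accepts strings,
# while os.getenv(name, True) returns the bool True whenever the variable is unset,
# and strtobool(True) then raises AttributeError. Minimal illustration (not part of
# the patch):
import os
from distutils.util import strtobool

flag = strtobool(os.getenv("fused_softmax_with_triangular", 'True'))  # -> 1 when unset
# strtobool(os.getenv("fused_softmax_with_triangular", True))  would raise AttributeError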
if not fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( From ba932a9d6d9d58091850995a73176371890a9b8b Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 14 Mar 2023 13:19:04 +0000 Subject: [PATCH 19/25] update --- ppfleetx/core/engine/eager_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index 346b01e7c..7dceafdb4 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -142,7 +142,7 @@ def configure_optimizers(self): self._amp_dtype = amp_config.get('dtype', 'float16') self._amp_level = amp_config.get('level', 'O2') - self._use_main_grad = amp_config.get('use_main_grad', True) + self._use_main_grad = amp_config.get('use_main_grad', False) self._scale_loss = amp_config['scale_loss'] self._custom_black_list = amp_config['custom_black_list'] self._custom_white_list = amp_config['custom_white_list'] From ea7779787abc3bf65cb27ef76076e6ad492ef8d9 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Tue, 14 Mar 2023 14:13:06 +0000 Subject: [PATCH 20/25] update --- .../models/language_model/language_module.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index 8036eb234..66eb63528 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -273,6 +273,11 @@ def get_model(self): num_classes = model_setting.pop("num_classes", 2) assert pretrained is not None + model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting.get('vocab_size', self.tokenizer.vocab_size), + model_setting.pop('vocab_size_divisible_unit'), + self.configs.Distributed.get('mp_degree', 1)) + l = model_setting['num_layers'] h = model_setting['hidden_size'] v = model_setting['vocab_size'] @@ -512,6 +517,11 @@ def get_model(self): tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) + model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting.get('vocab_size', self.tokenizer.vocab_size), + model_setting.pop('vocab_size_divisible_unit'), + self.configs.Distributed.get('mp_degree', 1)) + if self.nranks == 1: model = gpt.GPTForGeneration( gpt.GPTModel(**model_setting), self.generation_cfgs) @@ -632,7 +642,15 @@ def get_model(self): model_setting['skip_tensor_map'] = skip_tensor_map model_setting['freeze_embedding'] = freeze_embedding model_setting.pop("module") - model_setting.pop("name") + + model_name = model_setting.pop("name") + tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] + self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) + + model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting.get('vocab_size', self.tokenizer.vocab_size), + model_setting.pop('vocab_size_divisible_unit'), + self.configs.Distributed.get('mp_degree', 1)) if self.nranks == 1: model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting)) From 325d81c3da96214f187ee8a52be63a537ec55505 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Wed, 15 Mar 2023 07:09:05 +0000 Subject: [PATCH 21/25] update --- docs/standard.md | 12 ++- .../transformer/models/GPT/docs/README.md | 12 ++- .../configs/nlp/gpt/pretrain_gpt_base.yaml | 2 + ppfleetx/core/engine/eager_engine.py | 6 +- ppfleetx/distributed/apis/env.py | 17 +++-- .../gpt/dygraph/hybrid_model.py | 11 ++- 
.../gpt/dygraph/single_model.py | 13 ++-- .../models/language_model/language_module.py | 75 ++++++++----------- ppfleetx/utils/config.py | 2 +- projects/gpt/docs/README.md | 12 ++- 10 files changed, 85 insertions(+), 77 deletions(-) diff --git a/docs/standard.md b/docs/standard.md index 9aac894bd..334810fca 100644 --- a/docs/standard.md +++ b/docs/standard.md @@ -103,6 +103,8 @@ Engine: test_iters: mix_precision: enable: True + dtype: "float16" + level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] @@ -123,10 +125,12 @@ Engine: | logging_freq | 训练日志打印的频率 | | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | -| enable | 是否使用purefp16精度训练 | -| scale_loss | 使用fp16精度下,loss的放缩比例 | -| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16计算。 | -| custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16计算。| +| enable | 是否使用混合精度策略进行训练 | +| dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | +| level | 混合精度训练模式,默认``O2``模式 | +| scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | +| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | +| custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔 | | save_epoch | 保存模型epoch间隔 | | output_dir | 指定输出文件 | diff --git a/examples/transformer/models/GPT/docs/README.md b/examples/transformer/models/GPT/docs/README.md index a6d32886c..8eb94e27c 100644 --- a/examples/transformer/models/GPT/docs/README.md +++ b/examples/transformer/models/GPT/docs/README.md @@ -103,6 +103,8 @@ cd .. # 回到 GPT 目录下 test_iters: mix_precision: enable: True + dtype: "float16" + level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] @@ -128,10 +130,12 @@ cd .. 
# 回到 GPT 目录下 | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | -| enable | 是否使用purefp16精度训练 | -| scale_loss | 使用fp16精度下,loss的放缩比例 | -| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16计算。 | -| custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16计算。| +| enable | 是否使用混合精度策略进行训练 | +| dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | +| level | 混合精度训练模式,默认``O2``模式 | +| scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | +| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | +| custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔step数 | | save_epoch | 保存模型间隔epoch数 | | output_dir | 指定输出文件 | diff --git a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml index d7f5c0ead..04da8a106 100644 --- a/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml +++ b/ppfleetx/configs/nlp/gpt/pretrain_gpt_base.yaml @@ -38,6 +38,7 @@ Model: scale_qk_by_layer_num: True sequence_parallel: False use_flash_attn: False + fused_softmax_with_triangular: True Data: @@ -84,6 +85,7 @@ Optimizer: warmup_rate: 0.01 max_lr: 5.0e-5 min_lr: 1.0e-5 + use_increments: True grad_clip: name: "ClipGradByGlobalNorm" clip_norm: 1.0 diff --git a/ppfleetx/core/engine/eager_engine.py b/ppfleetx/core/engine/eager_engine.py index 7dceafdb4..182c0a912 100644 --- a/ppfleetx/core/engine/eager_engine.py +++ b/ppfleetx/core/engine/eager_engine.py @@ -201,6 +201,8 @@ def configure_optimizers(self): self._scaler = None if mode == 'train': + self._use_increments = configs.Optimizer.lr.pop('use_increments', + False) self._lr_scheduler_mode = configs.Optimizer.lr.pop('run_mode', 'step') assert self._lr_scheduler_mode in [ @@ -350,9 +352,9 @@ def _train_one_epoch(self, train_losses.append(loss) if self._lr_scheduler is not None and self._lr_scheduler_mode == 'step': - # TODO: if update_successful if self._scaler is None or self._scaler._found_inf == 0: - self._lr_scheduler.step(epoch=self._global_batch_size) + self._lr_scheduler.step(epoch=self._global_batch_size + if self._use_increments else None) if (step + 1) % self._logging_freq == 0: train_step_cost = get_timestamp() - train_step_start diff --git a/ppfleetx/distributed/apis/env.py b/ppfleetx/distributed/apis/env.py index d9855632b..28c160123 100644 --- a/ppfleetx/distributed/apis/env.py +++ b/ppfleetx/distributed/apis/env.py @@ -42,14 +42,15 @@ def set_seed(seed): else: mp_rank, pp_rank, data_world_rank, data_world_size = 0, 0, 0, 1 - seed += 100 * pp_rank - random.seed(seed) - np.random.seed(seed) - paddle.seed(seed) - - # random.seed(seed + data_world_rank) - # np.random.seed(seed + data_world_rank) - # paddle.seed(seed + data_world_rank) + # NOTE: the commented seeds are set only for precision validation + # seed += 100 * pp_rank + # random.seed(seed) + # np.random.seed(seed) + # paddle.seed(seed) + + random.seed(seed + data_world_rank) + np.random.seed(seed + data_world_rank) + paddle.seed(seed + data_world_rank) # local_seed/ global_seed is used to control dropout in ModelParallel local_seed = seed + 123 + mp_rank * 10 + pp_rank * 1000 + data_world_size diff --git a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index 9f9be4098..d01cdced1 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ 
b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -753,7 +753,8 @@ def __init__(self, no_recompute_layers=None, skip_tensor_map={}, freeze_embedding=False, - use_flash_attn=False): + use_flash_attn=False, + fused_softmax_with_triangular=False): super(GPTModelHybrid, self).__init__() @@ -762,6 +763,7 @@ def __init__(self, self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size + self.fused_softmax_with_triangular = fused_softmax_with_triangular if use_flash_attn: if flash_attention: @@ -847,12 +849,9 @@ def forward(self, embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) - # fused_soiftmax_with_triangular is only suppported on GPU/DCU. - fused_softmax_with_triangular = strtobool( - os.getenv("fused_softmax_with_triangular", 'True')) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. - if not fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( + if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( @@ -872,7 +871,7 @@ def forward(self, encoder_outputs = self.decoder( embedding_output, memory=None, - tgt_mask=None if (fused_softmax_with_triangular and + tgt_mask=None if (self.fused_softmax_with_triangular and self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, diff --git a/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 89c26b325..2f2a0a7ce 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -626,7 +626,8 @@ def __init__(self, no_recompute_layers=None, skip_tensor_map={}, freeze_embedding=False, - use_flash_attn=False): + use_flash_attn=False, + fused_softmax_with_triangular=False): super(GPTModel, self).__init__() @@ -635,6 +636,7 @@ def __init__(self, self.initializer_range = initializer_range self.hidden_size = hidden_size self.vocab_size = vocab_size + self.fused_softmax_with_triangular = fused_softmax_with_triangular if use_flash_attn: if flash_attention: @@ -727,11 +729,9 @@ def forward(self, embedding_output = self.embeddings( input_ids=input_ids, position_ids=position_ids) - fused_softmax_with_triangular = strtobool( - os.getenv("fused_softmax_with_triangular", 'True')) # fused_softmax_with_triangular is only suppported on GPU/DCU. # If on non-GPU devices, we use user defined mask and non-fused softmax. 
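# Standalone sketch of the flag flow these hunks establish: the YAML key
# Model.fused_softmax_with_triangular (added earlier in this patch) is stored on the
# model at construction time and then decides whether an explicit attention mask is
# passed to the decoder. select_tgt_mask is a hypothetical free function; in the patch
# the same expression appears inline in the model's forward.
import paddle

def select_tgt_mask(attention_mask, fused_softmax_with_triangular, training):
    # With the fused triangular softmax the causal mask is applied inside the fused
    # kernel, so tgt_mask is None; otherwise the user-defined mask is used.
    use_fused = (fused_softmax_with_triangular and training and
                 paddle.is_compiled_with_cuda())
    return None if use_fused else attention_mask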
- if not fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( + if not self.fused_softmax_with_triangular or not paddle.is_compiled_with_cuda( ): # TODO, use registered buffer causal_mask = paddle.tensor.triu( @@ -751,7 +751,7 @@ def forward(self, encoder_outputs = self.decoder( embedding_output, memory=None, - tgt_mask=None if (fused_softmax_with_triangular and + tgt_mask=None if (self.fused_softmax_with_triangular and self.training and paddle.is_compiled_with_cuda()) else attention_mask, # use softmax_mask_fuse_upper_triangle use_cache=use_cache, @@ -1234,7 +1234,8 @@ def _post_process_(outputs, input_ids, cur_len, origin_len, scores, shape=[paddle.shape(probs)[0]], fill_value=top_p, dtype=probs.dtype) - _, next_tokens = topp_sampling(probs, top_ps_tensor, random_seed=100) + _, next_tokens = topp_sampling( + probs, top_ps_tensor, random_seed=100) else: probs = TopPProcess(probs, top_p, min_tokens_to_keep) diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index 66eb63528..53b259937 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -44,6 +44,32 @@ } +def get_model_size(l, h, v, s): + P = 0 + # embedding + P += (v + s) * h + # attention + P += (4 * h * h + 4 * h) * l + # layer_norm of decoder + P += (2 * (2 * h)) * l + # FFN Layer + P += (8 * h * h + 5 * h) * l + # layer_norm of transformer + P += 2 * h + logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / 1000.0)) + + +def vocab_size_with_padding(vocab_size, div_unit, mp_degree): + padded_size = vocab_size + multiple = div_unit * mp_degree + while (padded_size % multiple) != 0: + padded_size += 1 + logging.warning(' > padded vocab (size: {}) with {} dummy tokens ' + '(new size: {})'.format(vocab_size, padded_size - + vocab_size, padded_size)) + return padded_size + + class LanguageModule(BasicModule): def __init__(self, configs): self.nranks = paddle.distributed.get_world_size() @@ -114,31 +140,6 @@ def test_step_end(self, log_dict): % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'], speed)) - def get_model_size(self, l, h, v, s): - P = 0 - # embedding - P += (v + s) * h - # attention - P += (4 * h * h + 4 * h) * l - # layer_norm of decoder - P += (2 * (2 * h)) * l - # FFN Layer - P += (8 * h * h + 5 * h) * l - # layer_norm of transformer - P += 2 * h - logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / - 1000.0)) - - def vocab_size_with_padding(self, vocab_size, div_unit, mp_degree): - padded_size = vocab_size - multiple = div_unit * mp_degree - while (padded_size % multiple) != 0: - padded_size += 1 - logging.warning(' > padded vocab (size: {}) with {} dummy tokens ' - '(new size: {})'.format(vocab_size, padded_size - - vocab_size, padded_size)) - return padded_size - def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) @@ -166,7 +167,7 @@ def get_model(self): tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) - model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit'), self.configs.Distributed.get('mp_degree', 1)) @@ -175,7 +176,7 @@ def get_model(self): h = model_setting['hidden_size'] v = 
model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len - self.get_model_size(l, h, v, s) + get_model_size(l, h, v, s) if self.nranks == 1: model_setting.pop("sequence_parallel") @@ -273,7 +274,7 @@ def get_model(self): num_classes = model_setting.pop("num_classes", 2) assert pretrained is not None - model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit'), self.configs.Distributed.get('mp_degree', 1)) @@ -283,7 +284,7 @@ def get_model(self): v = model_setting['vocab_size'] num_heads = model_setting['num_attention_heads'] s = self.configs.Data.Train.dataset.max_length - self.get_model_size(l, h, v, s) + get_model_size(l, h, v, s) model_name = model_setting.pop("name") tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] @@ -456,11 +457,6 @@ def test_step_end(self, log_dict): % (log_dict['epoch'], log_dict['batch'], log_dict['loss'], log_dict['test_cost'], speed)) - def get_model_size(self, l, h, v, s): - P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h)) - logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / - 1000.0)) - def training_epoch_end(self, log_dict): logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict['epoch'], log_dict['train_cost'])) @@ -517,7 +513,7 @@ def get_model(self): tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) - model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit'), self.configs.Distributed.get('mp_degree', 1)) @@ -647,7 +643,7 @@ def get_model(self): tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) - model_setting['vocab_size'] = self.vocab_size_with_padding( + model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit'), self.configs.Distributed.get('mp_degree', 1)) @@ -753,7 +749,7 @@ def get_model(self): h = model_setting['hidden_size'] v = model_setting['vocab_size'] s = self.configs.Data.Train.dataset.max_seq_len - self.get_model_size(l, h, v, s) + get_model_size(l, h, v, s) if self.nranks == 1: model_setting.pop("sequence_parallel") @@ -777,11 +773,6 @@ def get_loss_fn(self): loss_fn = gpt.GPTPretrainingCriterionHybird() return loss_fn - def get_model_size(self, l, h, v, s): - P = 12 * l * h * h * (1 + 13 / (12 * h) + (v + s) / (12 * l * h)) - logger.info('Model Size: {:.2f} B'.format(P / 1000.0 / 1000.0 / - 1000.0)) - def training_step(self, batch): tokens, position_ids, labels, loss_mask = batch diff --git a/ppfleetx/utils/config.py b/ppfleetx/utils/config.py index 59c3821bd..f3d85a476 100644 --- a/ppfleetx/utils/config.py +++ b/ppfleetx/utils/config.py @@ -524,7 +524,7 @@ def process_auto_strategy(config): amp_cfg = config.Engine.get('mix_precision', {}) amp = strategy.amp amp.enable = amp_cfg.get('level', "") in ['o1', 'o2', 'o3'] - amp.enable = amp_cfg.get('level', "") in ['o2', 'o3'] + amp.use_pure_fp16 = amp_cfg.get('level', "") in ['o2', 'o3'] amp.use_optimizer_fp16 = amp_cfg.get('level', "") in ['o3'] amp.use_fp16_guard = amp_cfg.get('use_fp16_guard', False) amp.init_loss_scaling = 
amp_cfg.get('scale_loss', 32768) diff --git a/projects/gpt/docs/README.md b/projects/gpt/docs/README.md index 5c332171e..10b88e8e1 100644 --- a/projects/gpt/docs/README.md +++ b/projects/gpt/docs/README.md @@ -132,6 +132,8 @@ Engine训练设置完成模型训练/验证/推理等过程中的参数设置, test_iters: mix_precision: enable: True + dtype: "float16" + level: "O2" scale_loss: 32768.0 custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"] custom_white_list: ["lookup_table", "lookup_table_v2"] @@ -152,10 +154,12 @@ Engine训练设置完成模型训练/验证/推理等过程中的参数设置, | eval_freq | 模型评估间隔 | | eval_iters | 模型评估时训练评估测试集的轮数 | | test_iters | 模型测试或推理时的轮数 | -| enable | 是否使用purefp16精度训练 | -| scale_loss | 使用fp16精度下,loss的放缩比例 | -| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持float16计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16计算。 | -| custom_white_list | 自定义算子白名单。这个名单中的算子在支持float16计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16计算。| +| enable | 是否使用混合精度策略进行训练 | +| dtype | 混合精度训练数据类型使用float16还是bfloat16,默认为float16类型 | +| level | 混合精度训练模式,默认``O2``模式 | +| scale_loss | 使用fp16混合精度策略下,loss的放缩比例 | +| custom_black_list | 自定义算子黑名单。这个名单中的算子在支持混合精度计算时会被认为是数值危险的,它们的影响也可能会在下游操作中观察到。这些算子通常不会转为float16/bfloat16计算 | +| custom_white_list | 自定义算子白名单。这个名单中的算子在支持混合精度计算时会被认为是数值安全的,并且对性能至关重要。如果设置了白名单,该名单中的算子会使用float16/bfloat16计算 | | save_steps | 保存模型间隔step数 | | save_epoch | 保存模型间隔epoch数 | | output_dir | 指定输出文件 | From fa6fa48253fe358cfb5eceecbcbdf6020f96be23 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Wed, 15 Mar 2023 07:47:44 +0000 Subject: [PATCH 22/25] update --- ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index d01cdced1..076bad3b0 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -1072,7 +1072,8 @@ def __init__(self, sequence_parallel=False, no_recompute_layers=None, pp_recompute_interval=1, - use_flash_attn=False): + use_flash_attn=False, + fused_softmax_with_triangular=False): # forward desc self.descs = [] From 1874cb6b678a30c483bc30bf75807064c3834570 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Wed, 15 Mar 2023 08:39:54 +0000 Subject: [PATCH 23/25] update --- ppfleetx/models/language_model/language_module.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index 53b259937..c07546716 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -274,6 +274,10 @@ def get_model(self): num_classes = model_setting.pop("num_classes", 2) assert pretrained is not None + model_name = model_setting.pop("name") + tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] + self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) + model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), model_setting.pop('vocab_size_divisible_unit'), @@ -286,10 +290,6 @@ def get_model(self): s = self.configs.Data.Train.dataset.max_length get_model_size(l, h, v, s) - model_name = model_setting.pop("name") - tokenizer_class, pretrained_name = MODEL_CLASSES[model_name] - self.tokenizer = tokenizer_class.from_pretrained(pretrained_name) - if self.nranks == 1: model = gpt.GPTForSequenceClassification( 
gpt.GPTModel(**model_setting), num_classes) From 62f32a9d786a7efac2ec161292d5a2eec26c8ba6 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Wed, 15 Mar 2023 13:09:17 +0000 Subject: [PATCH 24/25] update --- ppfleetx/models/language_model/language_module.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ppfleetx/models/language_model/language_module.py b/ppfleetx/models/language_model/language_module.py index c07546716..c7c0b7233 100644 --- a/ppfleetx/models/language_model/language_module.py +++ b/ppfleetx/models/language_model/language_module.py @@ -169,7 +169,7 @@ def get_model(self): model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), - model_setting.pop('vocab_size_divisible_unit'), + model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] @@ -280,7 +280,7 @@ def get_model(self): model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), - model_setting.pop('vocab_size_divisible_unit'), + model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) l = model_setting['num_layers'] @@ -515,7 +515,7 @@ def get_model(self): model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), - model_setting.pop('vocab_size_divisible_unit'), + model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) if self.nranks == 1: @@ -645,7 +645,7 @@ def get_model(self): model_setting['vocab_size'] = vocab_size_with_padding( model_setting.get('vocab_size', self.tokenizer.vocab_size), - model_setting.pop('vocab_size_divisible_unit'), + model_setting.pop('vocab_size_divisible_unit', 128), self.configs.Distributed.get('mp_degree', 1)) if self.nranks == 1: From 974e1d296f8c783d26a37286784c30f9fd5cc320 Mon Sep 17 00:00:00 2001 From: haohongxiang Date: Thu, 16 Mar 2023 01:46:05 +0000 Subject: [PATCH 25/25] update --- .../gpt/dygraph/hybrid_model.py | 19 +++++++++++++++++-- .../gpt/dygraph/single_model.py | 18 ++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py index 076bad3b0..27c4933e3 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/hybrid_model.py @@ -18,6 +18,7 @@ import logging from distutils.util import strtobool import os +import math import paddle import paddle.nn as nn @@ -105,6 +106,7 @@ def __init__(self, vdim=None, need_weights=False, weight_attr=None, + output_layer_weight_attr=None, bias_attr=None, fuse_attn_qkv=False, scale_qk_coeff=1.0, @@ -188,7 +190,7 @@ def __init__(self, embed_dim, embed_dim, mp_group=env.get_hcg().get_model_parallel_group(), - weight_attr=weight_attr, + weight_attr=output_layer_weight_attr, has_bias=True, input_is_parallel=True, fuse_matmul_bias=fused_linear) @@ -501,6 +503,7 @@ def __init__(self, act_dropout=None, normalize_before=True, weight_attr=None, + output_layer_weight_attr=None, bias_attr=None, num_partitions=1, fused_linear=False, @@ -544,6 +547,8 @@ def __init__(self, weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + output_layer_weight_attrs = _convert_param_attr_to_list( + output_layer_weight_attr, 3) self.self_attn = 
MultiHeadAttention( d_model, @@ -551,6 +556,7 @@ def __init__(self, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], + output_layer_weight_attr=output_layer_weight_attrs[0], num_partitions=num_partitions, fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, @@ -593,7 +599,7 @@ def __init__(self, dim_feedforward, d_model, mp_group=env.get_hcg().get_model_parallel_group(), - weight_attr=weight_attrs[2], + weight_attr=output_layer_weight_attrs[2], input_is_parallel=True, has_bias=True, fuse_matmul_bias=fused_linear) @@ -801,6 +807,11 @@ def __init__(self, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), + output_layer_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal( + mean=0.0, + std=self.initializer_range / math.sqrt( + 2.0 * num_layers))), bias_attr=None, num_partitions=num_partitions, fused_linear=fused_linear, @@ -1128,6 +1139,10 @@ def __init__(self, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=initializer_range)), + output_layer_weight_attr=paddle. + ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, + std=initializer_range / math.sqrt(2.0 * num_layers))), bias_attr=None, num_partitions=num_partitions, moe_configs=moe_configs, diff --git a/ppfleetx/models/language_model/gpt/dygraph/single_model.py b/ppfleetx/models/language_model/gpt/dygraph/single_model.py index 2f2a0a7ce..6b9b16f6e 100644 --- a/ppfleetx/models/language_model/gpt/dygraph/single_model.py +++ b/ppfleetx/models/language_model/gpt/dygraph/single_model.py @@ -19,6 +19,7 @@ from distutils.util import strtobool import os import numpy as np +import math import paddle import paddle.nn as nn @@ -99,6 +100,7 @@ def __init__(self, need_weights=False, weight_attr=None, bias_attr=None, + output_layer_weight_attr=None, fuse_attn_qkv=False, scale_qk_coeff=1.0, fused_linear=False, @@ -140,7 +142,10 @@ def __init__(self, self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( - embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) + embed_dim, + embed_dim, + output_layer_weight_attr, + bias_attr=bias_attr) def _fuse_prepare_qkv(self, query, use_cache=False, cache=None): mix_layer = self.qkv_proj(query) @@ -424,6 +429,7 @@ def __init__(self, enable_expert_tensor_parallelism=False, weight_attr=None, bias_attr=None, + output_layer_weight_attr=None, fused_linear=False, fuse_attn_qkv=False, scale_qk_coeff=1.0, @@ -448,6 +454,8 @@ def __init__(self, weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + output_layer_weight_attrs = _convert_param_attr_to_list( + output_layer_weight_attr, 3) Linear = FusedLinear if fused_linear else nn.Linear @@ -457,6 +465,7 @@ def __init__(self, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0], + output_layer_weight_attr=output_layer_weight_attrs[0], fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv, scale_qk_coeff=scale_qk_coeff, @@ -490,7 +499,7 @@ def __init__(self, self.linear2 = Linear( dim_feedforward, d_model, - weight_attrs[2], + output_layer_weight_attrs[2], bias_attr=bias_attrs[2]) if 'linear1' in skip_quant_tensors: @@ -686,6 +695,11 @@ def __init__(self, weight_attr=paddle.ParamAttr( initializer=nn.initializer.Normal( mean=0.0, std=self.initializer_range)), + output_layer_weight_attr=paddle.ParamAttr( + initializer=nn.initializer.Normal( + mean=0.0, + std=self.initializer_range / math.sqrt( + 2.0 * num_layers))), bias_attr=None, 
fused_linear=fused_linear, fuse_attn_qkv=fuse_attn_qkv,