PaddlePaddle · haohongxiang · Dec 19, 2022 · Dec 19, 2022
diff --git a/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_345M_single_card.yaml b/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_345M_single_card.yaml
@@ -0,0 +1,10 @@
+# Single-card offline evaluation config (language-model loss on the
+# WikiText-103 validation tokens, since cloze_eval is False).
+# NOTE(review): presumably every key not set here is inherited from the
+# base file named by `_base_` -- confirm against the config loader.
+_base_: ./eval_gpt_base.yaml
+
+
+# These values are identical to the Offline_Eval section of
+# eval_gpt_base.yaml.
+Offline_Eval:
+  eval_path: ./wikitext-103/wiki.valid.tokens
+  cloze_eval: False
+  overlapping_eval: 32
+  batch_size: 8
+  max_seq_len: 1024
+  logging_freq: 10
diff --git a/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml b/examples/transformer/models/GPT/offline-eval/configs/eval_gpt_base.yaml
@@ -0,0 +1,91 @@
+# Global runtime settings shared by the offline-eval configs.
+Global:
+  device: gpu
+  seed: 1024
+
+  global_batch_size: 
+  local_batch_size: 8
+  micro_batch_size: 8
+
+  max_steps: 500000
+  num_train_epochs: 1
+  accumulate_steps:
+  logging_freq: 1
+  eval_freq: 500
+  eval_iters: 10
+  test_iters:
+  # eval_impl() in impls.py passes use_pure_fp16 and the two custom lists to
+  # paddle.amp.auto_cast (level 'O2'); scale_loss is not read there.
+  mix_precision:
+    use_pure_fp16: True
+    scale_loss: 32768.0
+    custom_black_list: ["reduce_sum", "c_softmax_with_cross_entropy", "elementwise_div"]
+    custom_white_list: ["lookup_table", "lookup_table_v2"]
+  save_load:
+    save_steps: 1000
+    save_epoch: 1
+    output_dir: ./output
+    ckpt_dir:
+
+
+# build_model() in impls.py pops `name` to pick the tokenizer from
+# MODEL_CLASSES and passes every remaining key as a keyword argument to
+# gpt.GPTModel.
+Model:
+  name: "GPT"
+  vocab_size: 50304
+  hidden_size: 1024
+  num_layers: 24
+  num_attention_heads: 16
+  ffn_hidden_size: 4096
+  hidden_dropout_prob: 0.1
+  attention_probs_dropout_prob: 0.1
+  max_position_embeddings: 1024
+  type_vocab_size: 16
+  initializer_range: 0.02
+  use_recompute: False
+  recompute_granularity:
+  no_recompute_layers:
+  fused_linear: False
+  fuse_attn_qkv: True
+  sequence_parallel: False
+
+
+# Evaluation data pipeline: dataset, batch sampler and loader settings.
+Data:
+  Eval:
+    dataset:
+      # NOTE(review): same name as the LM_Eval_Dataset class in impls.py;
+      # presumably resolved to it -- confirm against the data builder.
+      name: LM_Eval_Dataset
+      input_dir: ./data/
+      split: [949, 50, 1]
+      max_seq_len: 1024
+      overlapping_eval: 
+    sampler:
+      name: GPTBatchSampler
+      shuffle: False
+      drop_last: True
+    loader:
+      num_workers: 1
+      return_list: False
+      collate_fn: gpt_collate_fn
+
+
+# All parallel degrees are 1: build_model() in impls.py raises RuntimeError
+# when the distributed world size is not 1.
+Distributed:
+  dp_degree: 1
+  mp_degree: 1
+  pp_degree: 1
+  sharding:
+    sharding_degree: 1
+    sharding_stage: 1
+    sharding_offload: False
+    reduce_overlap: False
+    broadcast_overlap: False
+
+
+# Profiler settings; profiling is switched off here (enable: False).
+Profiler:
+  enable: False
+  scheduler: [1, 5]
+  profiler_log: profiler_log
+  detailed: False
+
+
+# Offline evaluation settings.
+Offline_Eval:
+  eval_path: ./wikitext-103/wiki.valid.tokens
+  # Read by eval_impl() in impls.py: False -> the step returns the summed,
+  # loss-mask-weighted cross-entropy; True -> it returns the number of
+  # samples whose masked positions are all predicted correctly.
+  cloze_eval: False
+  overlapping_eval: 32
+  batch_size: 8
+  max_seq_len: 1024
+  logging_freq: 10
diff --git a/examples/transformer/models/GPT/offline-eval/configs/eval_pruned_gpt_345M_single_card.yaml b/examples/transformer/models/GPT/offline-eval/configs/eval_pruned_gpt_345M_single_card.yaml
@@ -0,0 +1,22 @@
+# Single-card cloze evaluation config (cloze_eval: True) with a Prune
+# section and dropout disabled.
+# NOTE(review): presumably every key not set here is inherited from the
+# base file named by `_base_` -- confirm against the config loader.
+_base_: ./eval_gpt_base.yaml
+
+
+# Dropout probabilities are 0.0 here (eval_gpt_base.yaml uses 0.1).
+Model:
+  hidden_dropout_prob: 0.0
+  attention_probs_dropout_prob: 0.0
+
+
+# NOTE(review): the Prune section is not read by the code in impls.py;
+# presumably it is consumed elsewhere -- confirm.
+Compress:
+  Prune:
+    enable: True
+    criterion: l1_norm
+    ratio: 0.125
+
+
+Offline_Eval:
+  eval_path: ./lambada_test.jsonl
+  # True makes eval_impl() in impls.py return a count of fully correct
+  # samples instead of a summed loss.
+  cloze_eval: True
+  overlapping_eval: 32
+  batch_size: 8
+  max_seq_len: 1024
+  logging_freq: 10
diff --git a/examples/transformer/models/GPT/offline-eval/configs/eval_qat_gpt_345M_single_card.yaml b/examples/transformer/models/GPT/offline-eval/configs/eval_qat_gpt_345M_single_card.yaml
@@ -0,0 +1,23 @@
+# Single-card offline evaluation config with a Quantization section
+# (language-model loss on WikiText-103 validation tokens).
+# NOTE(review): presumably every key not set here is inherited from the
+# base file named by `_base_` -- confirm against the config loader.
+_base_: ./eval_gpt_base.yaml
+
+
+Compress:
+  pretrained:
+  # build_model() in impls.py only checks that this section exists (it does
+  # not look at `enable`) and reads the optional keys `skip_tensor_map`
+  # (default {}) and `freeze_embedding` (default False); neither is set here.
+  Quantization:
+    enable: True
+    weight_quantize_type: 'abs_max'
+    activation_quantize_type: 'moving_average_abs_max'
+    activation_preprocess_type: 'PACT'
+    weight_bits: 8
+    activation_bits: 8
+    quantizable_layer_type: ['Linear', 'ColumnParallelLinear', 'RowParallelLinear']
+    onnx_format: True
+
+
+# These values are identical to the Offline_Eval section of
+# eval_gpt_base.yaml.
+Offline_Eval:
+  eval_path: ./wikitext-103/wiki.valid.tokens
+  cloze_eval: False
+  overlapping_eval: 32
+  batch_size: 8
+  max_seq_len: 1024
+  logging_freq: 10
diff --git a/examples/transformer/models/GPT/offline-eval/impls.py b/examples/transformer/models/GPT/offline-eval/impls.py
@@ -0,0 +1,243 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import copy
+import numpy as np
+import json
+import re
+import math
+
+import paddle
+import paddle.distributed as dist
+from ppfleetx.utils.log import logger
+from ppfleetx.distributed.apis import env
+from ppfleetx.models.language_model import gpt
+from ppfleetx.data.tokenizers import GPTTokenizer, GPTChineseTokenizer
+
+# Maps the `Model.name` config value to a (tokenizer class, pretrained
+# tokenizer name) pair; build_model() uses the pair to load the tokenizer.
+MODEL_CLASSES = {
+    "GPT": (GPTTokenizer, "gpt2"),
+    "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"),
+}
+
+
+def build_model(config):
+    nranks = dist.get_world_size()
+    model_setting = copy.deepcopy(config.Model)
+
+    if 'Compress' in config and 'Quantization' in config.Compress:
+        quant_setting = copy.deepcopy(config.Compress.Quantization)
+        model_setting['skip_tensor_map'] = quant_setting.get('skip_tensor_map',
+                                                             {})
+        model_setting['freeze_embedding'] = quant_setting.get(
+            'freeze_embedding', False)
+
+    model_name = model_setting.pop("name")
+    tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]
+    tokenizer = tokenizer_class.from_pretrained(pretrained_name)
+
+    if nranks == 1:
+        model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))
+    else:
+        raise RuntimeError(
+            "Only single-card offline eval is supported in GPTModel now.")
+
+    return model, tokenizer
+
+
+@paddle.no_grad()
+def eval_impl(config, batch, model):
+    model.eval()
+
+    use_fp16 = config.Global.mix_precision.use_pure_fp16
+    black_list = config.Global.mix_precision.custom_black_list
+    white_list = config.Global.mix_precision.custom_white_list
+
+    with paddle.amp.auto_cast(
+            use_fp16,
+            custom_black_list=black_list,
+            custom_white_list=white_list,
+            level='O2'):
+
+        tokens, loss_mask, attention_mask, position_ids, labels = batch
+        preds = model(tokens, position_ids, attention_mask)
+
+        if not config.Offline_Eval.cloze_eval:
+            masked_lm_loss = paddle.nn.functional.cross_entropy(
+                preds, labels, reduction="none")
+            loss = paddle.sum(masked_lm_loss * loss_mask)
+
+            return loss
+        else:
+            outputs = paddle.argmax(preds, -1)
+            acc = paddle.cast(outputs == labels, 'float32')
+            acc = paddle.where(
+                paddle.cast(loss_mask, 'bool'), acc, paddle.ones_like(acc))
+            acc = paddle.sum(paddle.prod(acc, -1))
+
+            return acc
+
+
+class LM_Eval_Dataset(paddle.io.Dataset):
+    def __init__(self,
+                 tokens,
+                 max_seq_len,
+                 eos_token_id,
+                 overlapping_eval=None,
+                 **kwargs):
+        self.tokens = tokens
+        self.seq_len = max_seq_len
+        self.pad_idx = eos_token_id
+        self.overlapping_eval = overlapping_eval
+        if self.overlapping_eval is None:
+            self.overlapping_eval = self.seq_len
+        self.overlapping_eval = max(1, self.overlapping_eval)
+
+        self.total_targets = len(self.tokens) - 1
+        # remove first sequence tokens
+        targets = max(self.total_targets - self.overlapping_eval, 0)
+        self.total_sequences = max(
+            math.ceil(targets / self.overlapping_eval) + 1, 1)
+
+    def __len__(self):
+        return self.total_sequences
+
+    def _construct_sample(self, tokens):
+        tokens = np.array(tokens).astype("int64").tolist()
+        labels = tokens[1:]
+        tokens = tokens[:-1]
+        seq_length = len(tokens)
+        # attention mask for the attention calulate
+        attention_mask = np.tri(seq_length, seq_length).reshape(
+            (1, seq_length, seq_length))
+
+        # the pad and eos tokens do not contribute the loss
+        loss_mask = np.ones(seq_length, dtype="float32")
+        loss_mask[np.where(np.array(tokens) == self.pad_idx)] = 0.0
+        position_ids = np.arange(0, seq_length, dtype="int64")
+
+        # -INF mask value as default
+        # attention_mask = (attention_mask - 1.0) * 1e9
+        # Bool mask of attention
+        attention_mask = attention_mask.astype("float32")
+        return [tokens, loss_mask, attention_mask, position_ids, labels]
+
+    def __getitem__(self, idx):
+        start_idx = idx * self.overlapping_eval
+        end_idx = start_idx + self.seq_len
+        tokens = self.tokens[start_idx:end_idx + 1]
+        num_tokens = len(tokens)
+        if num_tokens < self.seq_len + 1:
+            num_pad = (self.seq_len + 1 - num_tokens)
+            tokens += [self.pad_idx] * num_pad
+        [tokens, loss_mask, attention_mask, position_ids,
+         labels] = self._construct_sample(tokens)
+        if self.overlapping_eval != self.seq_len and idx != 0:
+            loss_mask[:-self.overlapping_eval] *= 0
+
+        return [tokens, loss_mask, attention_mask, position_ids, labels]
+
+
+class Lambada_Eval_Dataset(paddle.io.Dataset):
+    def __init__(self, tokens, labels, max_seq_len, eos_token_id, **kwargs):
+        self.pad_idx = eos_token_id
+        self.seq_len = max_seq_len
+        self.tokens = tokens
+        self.labels = labels
+
+    def __len__(self):
+        return len(self.tokens)
+
+    def _construct_sample(self, tokens):
+        tokens = np.array(tokens).astype("int64").tolist()
+        labels = tokens[1:]
+        tokens = tokens[:-1]
+
+        seq_length = len(tokens)
+        # attention mask for the attention calulate
+        attention_mask = np.tri(seq_length, seq_length).reshape(
+            (1, seq_length, seq_length))
+
+        # the pad and eos tokens do not contribute the loss
+        position_ids = np.arange(0, seq_length, dtype="int64")
+
+        # -INF mask value as default
+        #attention_mask = (attention_mask - 1.0) * 1e9
+        # Bool mask of attention
+        attention_mask = attention_mask.astype("float32")
+        return [tokens, attention_mask, position_ids, labels]
+
+    def __getitem__(self, idx):
+        tokens = self.tokens[idx][:self.seq_len]
+        labels = self.labels[idx]
+        tokens = tokens + labels
+        num_tokens = len(tokens)
+        if num_tokens < self.seq_len + 1:
+            num_pad = (self.seq_len + 1 - num_tokens)
+            tokens += [self.pad_idx] * num_pad
+        loss_mask = np.zeros(self.seq_len, dtype="float32")
+        loss_mask[num_tokens - len(labels) - 1:num_tokens - 1] = 1.
+        [tokens, attention_mask, position_ids,
+         labels] = self._construct_sample(tokens)
+        return [tokens, loss_mask, attention_mask, position_ids, labels]
+
+
+def wikitext_detokenizer(string):
+    # contractions
+    string = string.replace("s '", "s'")
+    string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
+
+    # number separators
+    string = string.replace(" @-@ ", "-")
+    string = string.replace(" @,@ ", ",")
+    string = string.replace(" @.@ ", ".")
+
+    # punctuation
+    string = string.replace(" : ", ": ")
+    string = string.replace(" ; ", "; ")
+    string = string.replace(" . ", ". ")
+    string = string.replace(" ! ", "! ")
+    string = string.replace(" ? ", "? ")
+    string = string.replace(" , ", ", ")
+
+    # double brackets
+    string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string)
+    string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string)
+    string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string)
+    string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string)
+    string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string)
+
+    # miscellaneous
+    string = string.replace("= = = =", "====")
+    string = string.replace("= = =", "===")
+    string = string.replace("= =", "==")
+    string = string.replace(" " + chr(176) + " ", chr(176))
+    string = string.replace(" \n", "\n")
+    string = string.replace("\n ", "\n")
+    string = string.replace(" N ", " 1 ")
+    string = string.replace(" 's", "'s")
+
+    return string
+
+
+def get_tokens(tokenizer, text, strict=True):
+    if not strict:
+        tokens = tokenizer.encode(text)
+        return tokens[:-1], [tokens[-1]]
+    last_token = text.split()[-1]
+    start_idx = text.rfind(last_token)
+    beginning_tokens = tokenizer.encode(text[:start_idx].strip())
+    last_token = tokenizer.encode(' ' + last_token)
+    return beginning_tokens, last_token