from paddle.io import Dataset, BatchSampler, DistributedBatchSampler, DataLoader
from paddlenlp.transformers import AutoModelForSequenceClassification, AutoTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup
-from paddlenlp.datasets import load_dataset
-from paddlenlp.data import Stack, Tuple, Pad
+from datasets import load_dataset
from paddle.metric import Accuracy
from paddlenlp.ops.optimizer import layerwise_lr_decay
from paddle.optimizer import AdamW
+from paddlenlp.data import DataCollatorWithPadding

all_languages = [
    "ar", "bg", "de", "el", "en", "es", "fr", "hi", "ru", "sw", "th", "tr",
@@ -137,6 +137,9 @@ def parse_args():
                        type=str,
                        choices=["cpu", "gpu", "xpu"],
                        help="The device to select to train the model, is must be cpu/gpu/xpu.")
+    parser.add_argument("--overwrite_cache",
+                        action="store_true",
+                        help="Whether to overwrite cache for dataset.")
    parser.add_argument("--use_amp",
                        type=distutils.util.strtobool,
                        default=False,
@@ -164,8 +167,8 @@ def evaluate(model, loss_fct, metric, data_loader, language):
    model.eval()
    metric.reset()
    for batch in data_loader:
-        input_ids, position_ids, attention_mask, labels = batch
-        logits = model(input_ids, position_ids, attention_mask)
+        labels = batch.pop("labels")
+        logits = model(**batch)
        loss = loss_fct(logits, labels)
        correct = metric.compute(logits, labels)
        metric.update(correct)
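The tuple unpacking disappears because the collated batch is now a dict; popping "labels" and splatting the rest with ** keeps the forward call independent of field order. A Paddle-free toy illustration (dummy_model is purely hypothetical):

def dummy_model(input_ids, position_ids=None, attention_mask=None):
    # stand-in for the real classifier: one fake logit row per sample
    return [[0.1, 0.2, 0.7] for _ in input_ids]

batch = {
    "input_ids": [[101, 7, 102], [101, 9, 102]],
    "position_ids": [[0, 1, 2], [0, 1, 2]],
    "attention_mask": [[1, 1, 1], [1, 1, 1]],
    "labels": [2, 0],
}
labels = batch.pop("labels")   # labels stay out of the forward pass
logits = dummy_model(**batch)  # same as dummy_model(input_ids=..., position_ids=..., attention_mask=...)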
@@ -178,21 +181,25 @@ def convert_example(example, tokenizer, max_seq_length=256):

def convert_example(example, tokenizer, max_seq_length=256):
    """convert a example into necessary features"""
-    # Get the label
-    label = example["label"]
-    premise = example["premise"]
-    hypothesis = example["hypothesis"]
    # Convert raw text to feature
-    example = tokenizer(premise,
-                        text_pair=hypothesis,
-                        max_seq_len=max_seq_length)
-    return example["input_ids"], example["position_ids"], example[
-        "attention_mask"], label
-
-
-def get_test_dataloader(args, language, batchify_fn, trans_func):
-    test_ds = load_dataset("xnli", language, splits="test")
-    test_ds = test_ds.map(trans_func, lazy=True)
+    tokenized_example = tokenizer(example["premise"],
+                                  text_pair=example["hypothesis"],
+                                  max_length=max_seq_length,
+                                  padding=False,
+                                  truncation=True,
+                                  return_position_ids=True,
+                                  return_attention_mask=True,
+                                  return_token_type_ids=False)
+    return tokenized_example
+
+
+def get_test_dataloader(args, language, batchify_fn, trans_func,
+                        remove_columns):
+    test_ds = load_dataset("xnli", language, split="test")
+    test_ds = test_ds.map(trans_func,
+                          batched=True,
+                          remove_columns=remove_columns,
+                          load_from_cache_file=not args.overwrite_cache)
    test_batch_sampler = BatchSampler(test_ds,
                                      batch_size=args.batch_size,
                                      shuffle=False)
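Dataset.map with batched=True hands convert_example whole columns at a time, remove_columns drops the raw text fields once they are tokenized, and the result is cached on disk unless --overwrite_cache forces load_from_cache_file=False. A self-contained toy of that mapping, with fake_trans_func standing in for the real tokenizer call:

from datasets import Dataset

toy = Dataset.from_dict({
    "premise": ["He is sleeping.", "The cat sat down."],
    "hypothesis": ["He is awake.", "An animal sat down."],
    "label": [2, 0],
})

def fake_trans_func(batch):
    # batch["premise"] is a list of strings here, not a single example
    return {"input_ids": [[1] * len(text.split()) for text in batch["premise"]]}

processed = toy.map(fake_trans_func,
                    batched=True,
                    remove_columns=["premise", "hypothesis"])
print(processed.column_names)  # the text columns are gone; "label" and "input_ids" remain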
@@ -220,11 +227,7 @@ def __getitem__(self, idx):
        last = language_idx - 1 if language_idx > 0 else language_idx
        sample_idx = idx - self.cumsum_len[last] if idx >= self.cumsum_len[
            last] else idx
-        input_ids = self.datasets[language_idx][sample_idx][0]
-        position_ids = self.datasets[language_idx][sample_idx][1]
-        attention_mask = self.datasets[language_idx][sample_idx][2]
-        label = self.datasets[language_idx][sample_idx][3]
-        return input_ids, position_ids, attention_mask, label
+        return self.datasets[int(language_idx)][int(sample_idx)]

    def __len__(self):
        return self.cumsum_len[-1]
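For reference, the cumulative-length arithmetic that __getitem__ relies on, re-implemented standalone; the np.searchsorted call is one illustrative way to pick language_idx, not necessarily how the class computes it:

import numpy as np

lengths = [3, 5, 2]              # e.g. sizes of the per-language datasets
cumsum_len = np.cumsum(lengths)  # [3, 8, 10]

def locate(idx):
    # map a global index onto (which dataset, index within that dataset)
    language_idx = int(np.searchsorted(cumsum_len, idx, side="right"))
    last = language_idx - 1 if language_idx > 0 else language_idx
    sample_idx = idx - cumsum_len[last] if idx >= cumsum_len[last] else idx
    return language_idx, sample_idx

print(locate(0))   # (0, 0)  first sample of the first dataset
print(locate(4))   # (1, 1)  second sample of the second dataset
print(locate(9))   # (2, 1)  last sample of the third dataset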
@@ -240,25 +243,28 @@ def do_train(args):
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
+    remove_columns = ["premise", "hypothesis"]
    if args.task_type == "cross-lingual-transfer":
-        train_ds = load_dataset("xnli", "en", splits="train")
-        train_ds = train_ds.map(trans_func, lazy=True)
+        train_ds = load_dataset("xnli", "en", split="train")
+        train_ds = train_ds.map(trans_func,
+                                batched=True,
+                                remove_columns=remove_columns,
+                                load_from_cache_file=not args.overwrite_cache)
    elif args.task_type == "translate-train-all":
        all_train_ds = []
        for language in all_languages:
-            train_ds = load_dataset("xnli", language, splits="train")
-            all_train_ds.append(train_ds.map(trans_func, lazy=True))
+            train_ds = load_dataset("xnli", language, split="train")
+            all_train_ds.append(
+                train_ds.map(trans_func,
+                             batched=True,
+                             remove_columns=remove_columns,
+                             load_from_cache_file=not args.overwrite_cache))
        train_ds = XnliDataset(all_train_ds)
    train_batch_sampler = DistributedBatchSampler(train_ds,
                                                  batch_size=args.batch_size,
                                                  shuffle=True)
-    batchify_fn = lambda samples, fn=Tuple(
-        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input_ids
-        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"
-            ),  # position_ids
-        Pad(axis=0, pad_val=0, dtype="int64"),  # attention_mask
-        Stack(dtype="int64")  # labels
-    ): fn(samples)
+    batchify_fn = DataCollatorWithPadding(tokenizer)
+
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
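DataCollatorWithPadding takes over what the removed Tuple(Pad, Pad, Pad, Stack) lambda did by hand: pad each field to the longest sequence in the batch, stack the labels, and return a dict keyed by feature name. A toy stand-in (not the PaddleNLP implementation) showing that behavior:

import numpy as np

def toy_collate(features, pad_token_id=1):
    # pad input_ids/position_ids with the pad token, attention_mask with 0,
    # and stack labels -- mirroring the old Pad/Pad/Pad/Stack batchify_fn
    max_len = max(len(f["input_ids"]) for f in features)
    pad_vals = {"input_ids": pad_token_id, "position_ids": pad_token_id,
                "attention_mask": 0}
    batch = {
        key: np.array([f[key] + [val] * (max_len - len(f[key])) for f in features],
                      dtype="int64")
        for key, val in pad_vals.items()
    }
    batch["labels"] = np.array([f["labels"] for f in features], dtype="int64")
    return batch

features = [
    {"input_ids": [101, 7, 8, 102], "position_ids": [0, 1, 2, 3],
     "attention_mask": [1, 1, 1, 1], "labels": 0},
    {"input_ids": [101, 9, 102], "position_ids": [0, 1, 2],
     "attention_mask": [1, 1, 1], "labels": 2},
]
print(toy_collate(features)["input_ids"])  # second row padded out to length 4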
@@ -318,11 +324,11 @@ def do_train(args):
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
-            input_ids, position_ids, attention_mask, labels = batch
+            labels = batch.pop("labels")
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
-                logits = model(input_ids, position_ids, attention_mask)
+                logits = model(**batch)
                loss = loss_fct(logits, labels)
            if args.use_amp:
                scaled_loss = scaler.scale(loss)
@@ -344,7 +350,7 @@ def do_train(args):
                for language in all_languages:
                    tic_eval = time.time()
                    test_data_loader = get_test_dataloader(
-                        args, language, batchify_fn, trans_func)
+                        args, language, batchify_fn, trans_func, remove_columns)
                    evaluate(model, loss_fct, metric, test_data_loader,
                             language)
                    print("eval done total : %s s" % (time.time() - tic_eval))