
Commit 75edf33

Fix datas to data (#2410)
* fix datas to data
* update code
* change all_data to texts in triton deploy
1 parent 0e76981 commit 75edf33
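
This commit is purely a set of variable renames: the non-standard plural `datas` becomes `data` throughout (plus removal of one unused list, and `datas` → `texts` in the Triton client); no behavior changes. As a quick orientation, the recurring pattern being renamed looks like the following minimal sketch — the file name here is hypothetical, the real paths are built from `path` or `input_prefix` in the diffs below:

import numpy as np

# np.load on an .npz archive returns a dict-like NpzFile
process_data = np.load("corpus_idx.npz")   # hypothetical file name
sizes = process_data["lens"]               # per-sentence lengths
doc_idx = process_data["docs"]             # document index boundaries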

20 files changed: +89 -90 lines changed


applications/neural_search/recall/domain_adaptive_pretraining/data_tools/dataset_utils.py (mode change 100644 → 100755, +4 -4)

@@ -52,10 +52,10 @@ def __init__(self, path, skip_warmup=False):
 
         self._token_ids = np.load(
             path + "_ids.npy", mmap_mode="r", allow_pickle=True)
-        process_datas = np.load(path + "_idx.npz")
-        self._sizes = process_datas["lens"]
-        self._pointers = process_datas["sents"]
-        self._doc_idx = process_datas["docs"]
+        process_data = np.load(path + "_idx.npz")
+        self._sizes = process_data["lens"]
+        self._pointers = process_data["sents"]
+        self._doc_idx = process_data["docs"]
 
     def __getstate__(self):
         return self._path

examples/information_extraction/DuEE/duee_1_postprocess.py (mode change 100644 → 100755, +6 -6)

@@ -24,16 +24,16 @@
 def predict_data_process(trigger_file, role_file, schema_file, save_path):
     """predict_data_process"""
     pred_ret = []
-    trigger_datas = read_by_lines(trigger_file)
+    trigger_data = read_by_lines(trigger_file)
     role_data = read_by_lines(role_file)
-    schema_datas = read_by_lines(schema_file)
+    schema_data = read_by_lines(schema_file)
     print("trigger predict {} load from {}".format(
-        len(trigger_datas), trigger_file))
+        len(trigger_data), trigger_file))
     print("role predict {} load from {}".format(len(role_data), role_file))
-    print("schema {} load from {}".format(len(schema_datas), schema_file))
+    print("schema {} load from {}".format(len(schema_data), schema_file))
 
     schema = {}
-    for s in schema_datas:
+    for s in schema_data:
         d_json = json.loads(s)
         schema[d_json["event_type"]] = [r["role"] for r in d_json["role_list"]]
 

@@ -50,7 +50,7 @@ def predict_data_process(trigger_file, role_file, schema_file, save_path):
             role_ret[role_type].append("".join(r["text"]))
         sent_role_mapping[d_json["id"]] = role_ret
 
-    for d in trigger_datas:
+    for d in trigger_data:
         d_json = json.loads(d)
         t_ret = extract_result(d_json["text"], d_json["pred"]["labels"])
         pred_event_types = list(set([t["type"] for t in t_ret]))

examples/language_model/chinesebert/utils.py (mode change 100644 → 100755, +2 -2)

@@ -210,7 +210,7 @@ def load_ds(datafiles):
         MapDataset
     '''
 
-    datas = []
+    data = []
 
     def read(ds_file):
         with open(ds_file, 'r', encoding='utf-8') as fp:

@@ -229,7 +229,7 @@ def read(ds_file):
 
 
 def load_ds_xnli(datafiles):
-    datas = []
+    data = []
 
     def read(ds_file):
         with open(ds_file, 'r', encoding='utf-8') as fp:

examples/language_model/elmo/elmo.py (mode change 100644 → 100755, +3 -3)

@@ -338,10 +338,10 @@ def encode(self, sentences: List[List[str]]):
         Each sentence is a list of tokens without <s> or </s>, e.g.
         [['The', 'first', 'sentence', '.'], ['Second', '.']]
         """
-        batch_datas = create_batches(sentences, self._batch_size, self._vocab,
-                                     self._max_seq_len)
+        batch_data = create_batches(sentences, self._batch_size, self._vocab,
+                                    self._max_seq_len)
         embeddings = []
-        for data in batch_datas:
+        for data in batch_data:
             ids, ids_reverse, seq_lens = data
             # [batch_size, num_lstm_layers + 1, max_seq_len, projection_dim * 2]
             outputs = self._elmo([ids, ids_reverse])

examples/language_model/gpt-3/dygraph/dataset.py (mode change 100644 → 100755, +5 -5)

@@ -301,10 +301,10 @@ def create_pretrained_dataset(args,
         logger.warning(
             "You are using compatible dataset, please make new dataset as the readme!"
         )
-        process_datas = np.load(
+        process_data = np.load(
             input_prefix + "_ids.npz", mmap_mode="r+", allow_pickle=True)
-        sample_ids = process_datas["ids"]
-        sample_lens = process_datas["lens"].astype("int32")
+        sample_ids = process_data["ids"]
+        sample_lens = process_data["lens"].astype("int32")
     else:
         for suffix in ["_ids.npy", "_idx.npz"]:
             if not os.path.isfile(input_prefix + suffix):

@@ -314,10 +314,10 @@ def create_pretrained_dataset(args,
             input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True)
         # All documment ids, extend as 1-D array.
 
-        process_datas = np.load(input_prefix + "_idx.npz")
+        process_data = np.load(input_prefix + "_idx.npz")
         # The len(sample_lens) num of docs
         # The sum(sample_lens) should equal len(sample_ids)
-        sample_lens = process_datas["lens"]
+        sample_lens = process_data["lens"]
 
     splits = get_train_valid_test_split_(args.split, len(sample_lens))
     assert len(sample_lens) >= splits[
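
The comments in the second hunk above state the invariant the `_idx.npz` index encodes: the per-document lengths partition the flat token-id array loaded from `_ids.npy`. A hypothetical sanity check (not part of this commit or the repository code) would read:

# Hypothetical check only: lengths in "_idx.npz" must sum to the
# number of token ids loaded from "_ids.npy".
assert int(np.sum(sample_lens)) == len(sample_ids)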

examples/language_model/gpt-3/static/dataset.py (mode change 100644 → 100755, +5 -5)

@@ -302,10 +302,10 @@ def create_pretrained_dataset(args,
         logger.warning(
             "You are using compatible dataset, please make new dataset as the readme!"
         )
-        process_datas = np.load(
+        process_data = np.load(
             input_prefix + "_ids.npz", mmap_mode="r+", allow_pickle=True)
-        sample_ids = process_datas["ids"]
-        sample_lens = process_datas["lens"].astype("int32")
+        sample_ids = process_data["ids"]
+        sample_lens = process_data["lens"].astype("int32")
     else:
         for suffix in ["_ids.npy", "_idx.npz"]:
             if not os.path.isfile(input_prefix + suffix):

@@ -315,10 +315,10 @@ def create_pretrained_dataset(args,
             input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True)
         # All documment ids, extend as 1-D array.
 
-        process_datas = np.load(input_prefix + "_idx.npz")
+        process_data = np.load(input_prefix + "_idx.npz")
         # The len(sample_lens) num of docs
         # The sum(sample_lens) should equal len(sample_ids)
-        sample_lens = process_datas["lens"]
+        sample_lens = process_data["lens"]
 
     splits = get_train_valid_test_split_(args.split, len(sample_lens))
     assert len(sample_lens) >= splits[

examples/language_model/moe/dygraph/dataset.py (mode change 100644 → 100755, +3 -3)

@@ -248,12 +248,12 @@ def create_pretrained_dataset(args,
         "The distributed run, total device num:{}, distinct dataflow num:{}.".
         format(device_world_size, data_world_size))
 
-    process_datas = np.load(input_path, mmap_mode="r+", allow_pickle=True)
+    process_data = np.load(input_path, mmap_mode="r+", allow_pickle=True)
     # All documment ids, extend as 1-D array.
-    sample_ids = process_datas["ids"]
+    sample_ids = process_data["ids"]
     # The len(sample_lens) num of docs
     # The sum(sample_lens) should equal len(sample_ids)
-    sample_lens = process_datas["lens"]
+    sample_lens = process_data["lens"]
 
     splits = get_train_valid_test_split_(args.split, len(sample_lens))
     assert len(sample_lens) >= splits[

examples/text_graph/erniesage/data/graph_reader.py (mode change 100644 → 100755, +3 -3)

@@ -54,7 +54,7 @@ def construct(tensors):
         """
         graph_num = 1
         start_len = 0
-        datas = []
+        data = []
         graph_list = []
         for graph in range(graph_num):
             graph_list.append(

@@ -64,7 +64,7 @@ def construct(tensors):
             start_len += 2
 
         for i in range(start_len, len(tensors)):
-            datas.append(tensors[i])
-        return graph_list, datas
+            data.append(tensors[i])
+        return graph_list, data
 
     return construct

examples/text_graph/erniesage/models/model.py (mode change 100644 → 100755, +3 -3)

@@ -46,17 +46,17 @@ def __init__(self, ernie, config):
         self.encoder = Encoder.factory(self.config_file, self.ernie)
         self.loss_func = LossFactory(self.config_file)
 
-    def forward(self, graphs, datas):
+    def forward(self, graphs, data):
         """Forward function of link prediction task.
 
         Args:
             graphs (Graph List): the Graph list.
-            datas (Tensor List): other input of the model.
+            data (Tensor List): other input of the model.
 
         Returns:
             Tensor: loss and output tensors.
         """
-        term_ids, user_index, pos_item_index, neg_item_index, user_real_index, pos_item_real_index = datas
+        term_ids, user_index, pos_item_index, neg_item_index, user_real_index, pos_item_real_index = data
         # encoder model
         outputs = self.encoder(graphs, term_ids,
                                [user_index, pos_item_index, neg_item_index])

model_zoo/electra/deploy/python/predict.py (mode change 100644 → 100755, +4 -4)

@@ -157,7 +157,7 @@ def predict(args, sentences=[], paths=[]):
     predictor = create_paddle_predictor(config)
 
     start_time = time.time()
-    output_datas = []
+    output_data = []
     count = 0
     for i, sen in enumerate(predicted_input):
         sen = np.array(sen).astype("int64")

@@ -176,9 +176,9 @@ def predict(args, sentences=[], paths=[]):
         output_names = predictor.get_output_names()
         # get output pointer and copy data(nd.array)
         output_tensor = predictor.get_output_tensor(output_names[0])
-        output_data = output_tensor.copy_to_cpu()
-        output_res = np.argmax(output_data, axis=1).tolist()
-        output_datas.append(output_res)
+        predict_data = output_tensor.copy_to_cpu()
+        output_res = np.argmax(predict_data, axis=1).tolist()
+        output_data.append(output_res)
 
         print("===== batch {} =====".format(i))
         for j in range(len(predicted_sens[i])):

model_zoo/electra/deploy/serving/client.py (mode change 100644 → 100755, -1)

@@ -146,7 +146,6 @@ def predict(args, sentences=[], paths=[]):
         predicted_data, tokenizer, args.max_seq_length, args.batch_size)
 
     start_time = time.time()
-    output_datas = []
     count = 0
     for i, sen in enumerate(predicted_input):
         sen = np.array(sen).astype("int64")

model_zoo/ernie-1.0/data_tools/dataset_utils.py (mode change 100644 → 100755, +3 -3)

@@ -158,12 +158,12 @@ def __init__(self, path, skip_warmup=False):
 
         self._token_ids = np.load(
             path + "_ids.npy", mmap_mode="r", allow_pickle=True)
-        process_datas = np.load(path + "_idx.npz")
-        self._sizes = process_datas["lens"]
+        process_data = np.load(path + "_idx.npz")
+        self._sizes = process_data["lens"]
         self._pointers = np.empty(len(self._sizes) + 1, dtype=np.int64)
         self._pointers[0] = 0
         np.cumsum(self._sizes, out=self._pointers[1:])
-        self._doc_idx = process_datas["docs"]
+        self._doc_idx = process_data["docs"]
 
     def __getstate__(self):
         return self._path

model_zoo/ernie-3.0/deploy/paddle2onnx/ernie_predictor.py (+4 -4)

@@ -58,18 +58,18 @@ def infer(self, input_dict: dict):
         return result
 
 
-def token_cls_print_ret(infer_result, input_datas):
+def token_cls_print_ret(infer_result, input_data):
     rets = infer_result["value"]
     for i, ret in enumerate(rets):
-        print("input data:", input_datas[i])
+        print("input data:", input_data[i])
         print("The model detects all entities:")
         for iterm in ret:
             print("entity:", iterm["entity"], " label:", iterm["label"],
                   " pos:", iterm["pos"])
         print("-----------------------------")
 
 
-def seq_cls_print_ret(infer_result, input_datas):
+def seq_cls_print_ret(infer_result, input_data):
     label_list = [
         "news_story", "news_culture", "news_entertainment", "news_sports",
         "news_finance", "news_house", "news_car", "news_edu", "news_tech",

@@ -79,7 +79,7 @@ def seq_cls_print_ret(infer_result, input_datas):
     label = infer_result["label"].squeeze().tolist()
     confidence = infer_result["confidence"].squeeze().tolist()
     for i, ret in enumerate(infer_result):
-        print("input data:", input_datas[i])
+        print("input data:", input_data[i])
         print("seq cls result:")
         print("label:", label_list[label[i]], " confidence:", confidence[i])
         print("-----------------------------")

model_zoo/ernie-3.0/deploy/python/ernie_predictor.py (+4 -4)

@@ -143,18 +143,18 @@ def infer(self, input_dict: dict):
         return result
 
 
-def token_cls_print_ret(infer_result, input_datas):
+def token_cls_print_ret(infer_result, input_data):
     rets = infer_result["value"]
     for i, ret in enumerate(rets):
-        print("input data:", input_datas[i])
+        print("input data:", input_data[i])
         print("The model detects all entities:")
         for iterm in ret:
             print("entity:", iterm["entity"], " label:", iterm["label"],
                   " pos:", iterm["pos"])
         print("-----------------------------")
 
 
-def seq_cls_print_ret(infer_result, input_datas):
+def seq_cls_print_ret(infer_result, input_data):
     label_list = [
         "news_story", "news_culture", "news_entertainment", "news_sports",
         "news_finance", "news_house", "news_car", "news_edu", "news_tech",

@@ -164,7 +164,7 @@ def seq_cls_print_ret(infer_result, input_datas):
     label = infer_result["label"].squeeze().tolist()
     confidence = infer_result["confidence"].squeeze().tolist()
     for i, ret in enumerate(infer_result):
-        print("input data:", input_datas[i])
+        print("input data:", input_data[i])
         print("seq cls result:")
         print("label:", label_list[label[i]], " confidence:", confidence[i])
         print("-----------------------------")

model_zoo/ernie-3.0/deploy/serving/seq_cls_rpc_client.py (mode change 100644 → 100755, +3 -3)

@@ -41,14 +41,14 @@ def test_tnews_dataset(client):
     idx = 0
     batch_size = 32
     while idx < len(dev_ds):
-        datas = []
+        data = []
         label = []
         for i in range(batch_size):
             if idx + i >= len(dev_ds):
                 break
-            datas.append(dev_ds[idx + i]["sentence"])
+            data.append(dev_ds[idx + i]["sentence"])
             label.append(dev_ds[idx + i]["label"])
-        batches.append(datas)
+        batches.append(data)
         labels.append(np.array(label))
         idx += batch_size
     """

model_zoo/ernie-3.0/deploy/serving/token_cls_rpc_client.py (mode change 100644 → 100755, +2 -2)

@@ -18,9 +18,9 @@
 import json
 
 
-def print_ret(rets, input_datas):
+def print_ret(rets, input_data):
     for i, ret in enumerate(rets):
-        print("input data:", input_datas[i])
+        print("input data:", input_data[i])
         print("The model detects all entities:")
         for iterm in ret:
             print("entity:", iterm["entity"], " label:", iterm["label"],

model_zoo/ernie-3.0/deploy/triton/seq_cls_grpc_client.py (+6 -6)

@@ -102,14 +102,14 @@ def test_tnews_dataset(runner):
     idx = 0
     batch_size = 32
     while idx < len(dev_ds):
-        datas = []
+        data = []
         label = []
        for i in range(batch_size):
             if idx + i >= len(dev_ds):
                 break
-            datas.append(dev_ds[idx + i]["sentence"])
+            data.append(dev_ds[idx + i]["sentence"])
             label.append(dev_ds[idx + i]["label"])
-        batches.append(datas)
+        batches.append(data)
         labels.append(np.array(label))
         idx += batch_size
 

@@ -126,12 +126,12 @@ def test_tnews_dataset(runner):
     model_version = "1"
     url = "localhost:8001"
     runner = SyncGRPCTritonRunner(url, model_name, model_version)
-    datas = [["你家拆迁,要钱还是要房?答案一目了然", "军嫂探亲拧包入住,部队家属临时来队房标准有了规定,全面落实!"],
+    texts = [["你家拆迁,要钱还是要房?答案一目了然", "军嫂探亲拧包入住,部队家属临时来队房标准有了规定,全面落实!"],
             ["区块链投资心得,能做到就不会亏钱", ]]
 
-    for data in datas:
+    for text in texts:
         # input format:[input1, input2 ... inputn], n = len(self._input_names)
-        result = runner.Run([data])
+        result = runner.Run([text])
         print(result)
 
     test_tnews_dataset(runner)
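
For readability, here is a minimal sketch of the input shape the renamed loop above assumes; the sentences are illustrative placeholders, not the ones used in the client:

# texts is a list of batches; each batch is a list of raw sentences.
texts = [
    ["sentence one", "sentence two"],   # batch of two sentences
    ["sentence three"],                 # batch of one sentence
]
for text in texts:
    # one entry per model input name; this client feeds a single text input
    result = runner.Run([text])
    print(result)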

model_zoo/gpt/dataset.py (mode change 100644 → 100755, +5 -5)

@@ -300,10 +300,10 @@ def create_pretrained_dataset(
         logger.warning(
             "You are using compatible dataset, please make new dataset as the readme!"
         )
-        process_datas = np.load(
+        process_data = np.load(
             input_prefix + "_ids.npz", mmap_mode="r+", allow_pickle=True)
-        sample_ids = process_datas["ids"]
-        sample_lens = process_datas["lens"].astype("int32")
+        sample_ids = process_data["ids"]
+        sample_lens = process_data["lens"].astype("int32")
     else:
         for suffix in ["_ids.npy", "_idx.npz"]:
             if not os.path.isfile(input_prefix + suffix):

@@ -313,10 +313,10 @@ def create_pretrained_dataset(
             input_prefix + "_ids.npy", mmap_mode="r", allow_pickle=True)
         # All documment ids, extend as 1-D array.
 
-        process_datas = np.load(input_prefix + "_idx.npz")
+        process_data = np.load(input_prefix + "_idx.npz")
         # The len(sample_lens) num of docs
         # The sum(sample_lens) should equal len(sample_ids)
-        sample_lens = process_datas["lens"]
+        sample_lens = process_data["lens"]
 
     splits = get_train_valid_test_split_(args.split, len(sample_lens))
     assert len(sample_lens) >= splits[
