Skip to content

fix typo #316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions paddlehub/dataset/base_cv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from paddlehub.common.logger import logger


class BaseCVDatast(BaseDataset):
class BaseCVDataset(BaseDataset):
def __init__(self,
base_path,
train_list_file=None,
Expand All @@ -35,7 +35,7 @@ def __init__(self,
predict_list_file=None,
label_list_file=None,
label_list=None):
super(BaseCVDatast, self).__init__(
super(BaseCVDataset, self).__init__(
base_path=base_path,
train_file=train_list_file,
dev_file=validate_list_file,
Expand Down Expand Up @@ -65,7 +65,7 @@ def _read_file(self, data_path, phase=None):
return data


# discarded. please use BaseCVDatast
# discarded. please use BaseCVDataset
class ImageClassificationDataset(object):
def __init__(self):
logger.warning(
Expand Down
66 changes: 42 additions & 24 deletions paddlehub/dataset/base_nlp_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
import csv

from paddlehub.dataset import InputExample, BaseDataset
from paddlehub.common.logger import logger


class BaseNLPDatast(BaseDataset):
class BaseNLPDataset(BaseDataset):
def __init__(self,
base_path,
train_file=None,
Expand All @@ -32,49 +33,66 @@ def __init__(self,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
super(BaseNLPDatast, self).__init__(
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False):
super(BaseNLPDataset, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
test_file=test_file,
predict_file=predict_file,
label_file=label_file,
label_list=label_list,
train_file_with_head=train_file_with_head,
dev_file_with_head=dev_file_with_head,
test_file_with_head=test_file_with_head,
predict_file_with_head=predict_file_with_head)
train_file_with_header=train_file_with_header,
dev_file_with_header=dev_file_with_header,
test_file_with_header=test_file_with_header,
predict_file_with_header=predict_file_with_header)

def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
has_warned = False
with io.open(input_file, "r", encoding="UTF-8") as file:
reader = csv.reader(file, delimiter="\t", quotechar=None)
examples = []
for (i, line) in enumerate(reader):
if i == 0:
ncol = len(line)
if self.if_file_with_head[phase]:
if self.if_file_with_header[phase]:
continue
if ncol == 1:
if phase != "predict":
example = InputExample(guid=i, text_a=line[0])
else:
if phase != "predict":
if ncol == 1:
raise Exception(
"the %s file: %s only has one column but it is not a predict file"
% (phase, input_file))
elif ncol == 2:
example = InputExample(
guid=i, text_a=line[0], label=line[1])
elif ncol == 3:
example = InputExample(
guid=i, text_a=line[0], text_b=line[1], label=line[2])
elif ncol == 2:
example = InputExample(
guid=i, text_a=line[0], label=line[1])
elif ncol == 3:
example = InputExample(
guid=i,
text_a=line[0],
text_b=line[1],
label=line[2])
else:
raise Exception(
"the %s file: %s has too many columns (should <=3)"
% (phase, input_file))
else:
raise Exception(
"the %s file: %s has too many columns (should <=3)" %
(phase, input_file))
if ncol == 1:
example = InputExample(guid=i, text_a=line[0])
elif ncol == 2:
if not has_warned:
logger.warning(
"the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
% (input_file))
has_warned = True
example = InputExample(
guid=i, text_a=line[0], text_b=line[1])
else:
raise Exception(
"the predict file: %s has too many columns (should <=2)"
% (input_file))
examples.append(example)
return examples
4 changes: 2 additions & 2 deletions paddlehub/dataset/bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


class BQ(BaseNLPDatast):
class BQ(BaseNLPDataset):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add dataset comments, like MSRA_NER and the other datasets.

def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/chnsenticorp.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


class ChnSentiCorp(BaseNLPDatast):
class ChnSentiCorp(BaseNLPDataset):
"""
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining)
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/cmrc2018.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁'
Expand Down Expand Up @@ -62,7 +62,7 @@ def __repr__(self):
return s


class CMRC2018(BaseNLPDatast):
class CMRC2018(BaseNLPDataset):
"""A single set of features of data."""

def __init__(self):
Expand Down
18 changes: 9 additions & 9 deletions paddlehub/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ def __init__(self,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False):
if not (train_file or dev_file or test_file):
raise ValueError("At least one file should be assigned")
self.base_path = base_path
Expand All @@ -83,11 +83,11 @@ def __init__(self,
self.test_examples = []
self.predict_examples = []

self.if_file_with_head = {
"train": train_file_with_head,
"dev": dev_file_with_head,
"test": test_file_with_head,
"predict": predict_file_with_head
self.if_file_with_header = {
"train": train_file_with_header,
"dev": dev_file_with_header,
"test": test_file_with_header,
"predict": predict_file_with_header
}

if train_file:
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/dogcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class DogCatDataset(BaseCVDatast):
class DogCatDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/drcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁'
Expand Down Expand Up @@ -62,7 +62,7 @@ def __repr__(self):
return s


class DRCD(BaseNLPDatast):
class DRCD(BaseNLPDataset):
"""A single set of features of data."""

def __init__(self):
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/flowers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class FlowersDataset(BaseCVDatast):
class FlowersDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/food101.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class Food101Dataset(BaseCVDatast):
class Food101Dataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
"images")
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
from paddlehub.dataset import InputExample
from paddlehub.common.logger import logger
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"


class GLUE(BaseNLPDatast):
class GLUE(BaseNLPDataset):
"""
Please refer to
https://gluebenchmark.com
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/iflytek.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"


class IFLYTEK(BaseNLPDatast):
class IFLYTEK(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/indoor67.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class Indoor67Dataset(BaseCVDatast):
class Indoor67Dataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/inews.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"


class INews(BaseNLPDatast):
class INews(BaseNLPDataset):
"""
INews is a sentiment analysis dataset for Internet News
"""
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/lcqmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"


class LCQMC(BaseNLPDatast):
class LCQMC(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/msra_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"


class MSRA_NER(BaseNLPDatast):
class MSRA_NER(BaseNLPDataset):
"""
A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/nlpcc_dbqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"


class NLPCC_DBQA(BaseNLPDatast):
class NLPCC_DBQA(BaseNLPDataset):
"""
Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"

Expand Down Expand Up @@ -65,7 +65,7 @@ def __repr__(self):
return s


class SQUAD(BaseNLPDatast):
class SQUAD(BaseNLPDataset):
"""A single set of features of data."""

def __init__(self, version_2_with_negative=False):
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/stanford_dogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class StanfordDogsDataset(BaseCVDatast):
class StanfordDogsDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME,
"StanfordDogs-120")
Expand Down
Loading