
Commit 90bbb04

kinghuin authored and nepeplwu committed
fix typo (#316)
* fix typo
* enhance predict dataset
1 parent 3964311 commit 90bbb04

23 files changed: +93 -75 lines changed

paddlehub/dataset/base_cv_dataset.py (+3 -3)

@@ -26,7 +26,7 @@
 from paddlehub.common.logger import logger


-class BaseCVDatast(BaseDataset):
+class BaseCVDataset(BaseDataset):
     def __init__(self,
                  base_path,
                  train_list_file=None,
@@ -35,7 +35,7 @@ def __init__(self,
                  predict_list_file=None,
                  label_list_file=None,
                  label_list=None):
-        super(BaseCVDatast, self).__init__(
+        super(BaseCVDataset, self).__init__(
             base_path=base_path,
             train_file=train_list_file,
             dev_file=validate_list_file,
@@ -65,7 +65,7 @@ def _read_file(self, data_path, phase=None):
         return data


-# discarded. please use BaseCVDatast
+# discarded. please use BaseCVDataset
 class ImageClassificationDataset(object):
     def __init__(self):
         logger.warning(
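For orientation, here is a minimal sketch of how a custom image dataset might subclass the renamed BaseCVDataset, using only the constructor keywords visible in the hunks above; the directory and list-file names are hypothetical and illustrate the call shape, not a fixed layout.

# Hedged sketch: subclassing the renamed BaseCVDataset. Paths and list files
# below are made up; only the keyword names come from the diff above.
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class MyImageDataset(BaseCVDataset):
    def __init__(self):
        super(MyImageDataset, self).__init__(
            base_path="/path/to/my_images",          # hypothetical dataset root
            train_list_file="train_list.txt",        # assumed "<image path> <label>" per line
            validate_list_file="validate_list.txt",
            predict_list_file="predict_list.txt",
            label_list_file="label_list.txt")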

paddlehub/dataset/base_nlp_dataset.py (+42 -24)

@@ -21,9 +21,10 @@
 import csv

 from paddlehub.dataset import InputExample, BaseDataset
+from paddlehub.common.logger import logger


-class BaseNLPDatast(BaseDataset):
+class BaseNLPDataset(BaseDataset):
     def __init__(self,
                  base_path,
                  train_file=None,
@@ -32,49 +33,66 @@ def __init__(self,
                  predict_file=None,
                  label_file=None,
                  label_list=None,
-                 train_file_with_head=False,
-                 dev_file_with_head=False,
-                 test_file_with_head=False,
-                 predict_file_with_head=False):
-        super(BaseNLPDatast, self).__init__(
+                 train_file_with_header=False,
+                 dev_file_with_header=False,
+                 test_file_with_header=False,
+                 predict_file_with_header=False):
+        super(BaseNLPDataset, self).__init__(
             base_path=base_path,
             train_file=train_file,
             dev_file=dev_file,
             test_file=test_file,
             predict_file=predict_file,
             label_file=label_file,
             label_list=label_list,
-            train_file_with_head=train_file_with_head,
-            dev_file_with_head=dev_file_with_head,
-            test_file_with_head=test_file_with_head,
-            predict_file_with_head=predict_file_with_head)
+            train_file_with_header=train_file_with_header,
+            dev_file_with_header=dev_file_with_header,
+            test_file_with_header=test_file_with_header,
+            predict_file_with_header=predict_file_with_header)

     def _read_file(self, input_file, phase=None):
         """Reads a tab separated value file."""
+        has_warned = False
         with io.open(input_file, "r", encoding="UTF-8") as file:
             reader = csv.reader(file, delimiter="\t", quotechar=None)
             examples = []
             for (i, line) in enumerate(reader):
                 if i == 0:
                     ncol = len(line)
-                    if self.if_file_with_head[phase]:
+                    if self.if_file_with_header[phase]:
                         continue
-                if ncol == 1:
-                    if phase != "predict":
-                        example = InputExample(guid=i, text_a=line[0])
-                    else:
+                if phase != "predict":
+                    if ncol == 1:
                         raise Exception(
                             "the %s file: %s only has one column but it is not a predict file"
                             % (phase, input_file))
-                elif ncol == 2:
-                    example = InputExample(
-                        guid=i, text_a=line[0], label=line[1])
-                elif ncol == 3:
-                    example = InputExample(
-                        guid=i, text_a=line[0], text_b=line[1], label=line[2])
+                    elif ncol == 2:
+                        example = InputExample(
+                            guid=i, text_a=line[0], label=line[1])
+                    elif ncol == 3:
+                        example = InputExample(
+                            guid=i,
+                            text_a=line[0],
+                            text_b=line[1],
+                            label=line[2])
+                    else:
+                        raise Exception(
+                            "the %s file: %s has too many columns (should <=3)"
+                            % (phase, input_file))
                 else:
-                    raise Exception(
-                        "the %s file: %s has too many columns (should <=3)" %
-                        (phase, input_file))
+                    if ncol == 1:
+                        example = InputExample(guid=i, text_a=line[0])
+                    elif ncol == 2:
+                        if not has_warned:
+                            logger.warning(
+                                "the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
+                                % (input_file))
+                            has_warned = True
+                        example = InputExample(
+                            guid=i, text_a=line[0], text_b=line[1])
+                    else:
+                        raise Exception(
+                            "the predict file: %s has too many columns (should <=2)"
+                            % (input_file))
                 examples.append(example)
         return examples
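The enhanced predict branch above means a tab-separated predict file may now carry either one column (text_a) or two columns (text_a and text_b, with a one-time warning), while files with a header row are skipped via the renamed *_file_with_header flags. A hedged sketch of a subclass that would exercise this path; the directory, file names, and labels are hypothetical.

# Hedged sketch only: base_path, file names, and labels are made up.
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


class MyPairDataset(BaseNLPDataset):
    def __init__(self):
        super(MyPairDataset, self).__init__(
            base_path="/path/to/my_dataset",     # hypothetical directory
            train_file="train.tsv",              # columns: text_a \t label
            dev_file="dev.tsv",
            predict_file="predict.tsv",          # columns: text_a \t text_b (second column becomes text_b)
            label_list=["0", "1"],
            train_file_with_header=True,         # renamed from train_file_with_head
            dev_file_with_header=True,
            predict_file_with_header=True)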

paddlehub/dataset/bq.py (+2 -2)

@@ -20,10 +20,10 @@
 import os

 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


-class BQ(BaseNLPDatast):
+class BQ(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "bq")
         base_path = self._download_dataset(

paddlehub/dataset/chnsenticorp.py (+2 -2)

@@ -23,10 +23,10 @@

 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


-class ChnSentiCorp(BaseNLPDatast):
+class ChnSentiCorp(BaseNLPDataset):
     """
     ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
     opinion mining)

paddlehub/dataset/cmrc2018.py (+2 -2)

@@ -20,7 +20,7 @@
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -62,7 +62,7 @@ def __repr__(self):
         return s


-class CMRC2018(BaseNLPDatast):
+class CMRC2018(BaseNLPDataset):
     """A single set of features of data."""

     def __init__(self):

paddlehub/dataset/dataset.py (+9 -9)

@@ -64,10 +64,10 @@ def __init__(self,
                  predict_file=None,
                  label_file=None,
                  label_list=None,
-                 train_file_with_head=False,
-                 dev_file_with_head=False,
-                 test_file_with_head=False,
-                 predict_file_with_head=False):
+                 train_file_with_header=False,
+                 dev_file_with_header=False,
+                 test_file_with_header=False,
+                 predict_file_with_header=False):
         if not (train_file or dev_file or test_file):
             raise ValueError("At least one file should be assigned")
         self.base_path = base_path
@@ -83,11 +83,11 @@ def __init__(self,
         self.test_examples = []
         self.predict_examples = []

-        self.if_file_with_head = {
-            "train": train_file_with_head,
-            "dev": dev_file_with_head,
-            "test": test_file_with_head,
-            "predict": predict_file_with_head
+        self.if_file_with_header = {
+            "train": train_file_with_header,
+            "dev": dev_file_with_header,
+            "test": test_file_with_header,
+            "predict": predict_file_with_header
         }

         if train_file:
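Because the constructor keywords are renamed here rather than aliased, downstream code that still passes the old *_file_with_head arguments would fail with an unexpected-keyword TypeError after this change. A hedged before/after sketch, using the NLP subclass for concreteness; the base_path, file name, and labels are hypothetical.

# Hedged sketch of the keyword rename; arguments are illustrative only.
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

# before this commit (old keyword, now rejected):
# BaseNLPDataset(base_path="data", train_file="train.tsv",
#                label_list=["0", "1"], train_file_with_head=True)

# after this commit (renamed keyword):
BaseNLPDataset(base_path="data", train_file="train.tsv",
               label_list=["0", "1"], train_file_with_header=True)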

paddlehub/dataset/dogcat.py (+2 -2)

@@ -20,10 +20,10 @@
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset


-class DogCatDataset(BaseCVDatast):
+class DogCatDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
         base_path = self._download_dataset(

paddlehub/dataset/drcd.py (+2 -2)

@@ -20,7 +20,7 @@
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
 SPIECE_UNDERLINE = '▁'
@@ -62,7 +62,7 @@ def __repr__(self):
         return s


-class DRCD(BaseNLPDatast):
+class DRCD(BaseNLPDataset):
     """A single set of features of data."""

     def __init__(self):

paddlehub/dataset/flowers.py (+2 -2)

@@ -20,10 +20,10 @@
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset


-class FlowersDataset(BaseCVDatast):
+class FlowersDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
         base_path = self._download_dataset(

paddlehub/dataset/food101.py (+2 -2)

@@ -20,10 +20,10 @@
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset


-class Food101Dataset(BaseCVDatast):
+class Food101Dataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
                                     "images")

paddlehub/dataset/glue.py (+2 -2)

@@ -24,12 +24,12 @@
 from paddlehub.dataset import InputExample
 from paddlehub.common.logger import logger
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"


-class GLUE(BaseNLPDatast):
+class GLUE(BaseNLPDataset):
     """
     Please refer to
     https://gluebenchmark.com

paddlehub/dataset/iflytek.py (+2 -2)

@@ -22,12 +22,12 @@

 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"


-class IFLYTEK(BaseNLPDatast):
+class IFLYTEK(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "iflytek")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)

paddlehub/dataset/indoor67.py (+2 -2)

@@ -20,10 +20,10 @@
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset


-class Indoor67Dataset(BaseCVDatast):
+class Indoor67Dataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
         base_path = self._download_dataset(

paddlehub/dataset/inews.py (+2 -2)

@@ -23,12 +23,12 @@

 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"


-class INews(BaseNLPDatast):
+class INews(BaseNLPDataset):
     """
     INews is a sentiment analysis dataset for Internet News
     """

paddlehub/dataset/lcqmc.py (+2 -2)

@@ -23,12 +23,12 @@

 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"


-class LCQMC(BaseNLPDatast):
+class LCQMC(BaseNLPDataset):
     def __init__(self):
         dataset_dir = os.path.join(DATA_HOME, "lcqmc")
         base_path = self._download_dataset(dataset_dir, url=_DATA_URL)

paddlehub/dataset/msra_ner.py (+2 -2)

@@ -23,12 +23,12 @@

 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"


-class MSRA_NER(BaseNLPDatast):
+class MSRA_NER(BaseNLPDataset):
     """
     A set of manually annotated Chinese word-segmentation data and
     specifications for training and testing a Chinese word-segmentation system

paddlehub/dataset/nlpcc_dbqa.py (+2 -2)

@@ -23,12 +23,12 @@

 from paddlehub.dataset import InputExample
 from paddlehub.common.dir import DATA_HOME
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"


-class NLPCC_DBQA(BaseNLPDatast):
+class NLPCC_DBQA(BaseNLPDataset):
     """
     Please refer to
     http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf

paddlehub/dataset/squad.py (+2 -2)

@@ -20,7 +20,7 @@
 from paddlehub.reader import tokenization
 from paddlehub.common.dir import DATA_HOME
 from paddlehub.common.logger import logger
-from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
+from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

 _DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"

@@ -65,7 +65,7 @@ def __repr__(self):
         return s


-class SQUAD(BaseNLPDatast):
+class SQUAD(BaseNLPDataset):
     """A single set of features of data."""

     def __init__(self, version_2_with_negative=False):

paddlehub/dataset/stanford_dogs.py (+2 -2)

@@ -20,10 +20,10 @@
 import os

 import paddlehub as hub
-from paddlehub.dataset.base_cv_dataset import BaseCVDatast
+from paddlehub.dataset.base_cv_dataset import BaseCVDataset


-class StanfordDogsDataset(BaseCVDatast):
+class StanfordDogsDataset(BaseCVDataset):
     def __init__(self):
         dataset_path = os.path.join(hub.common.dir.DATA_HOME,
                                     "StanfordDogs-120")
