Skip to content

fix typo #316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 8, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions paddlehub/dataset/base_cv_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
from paddlehub.common.logger import logger


class BaseCVDatast(BaseDataset):
class BaseCVDataset(BaseDataset):
def __init__(self,
base_path,
train_list_file=None,
Expand All @@ -35,7 +35,7 @@ def __init__(self,
predict_list_file=None,
label_list_file=None,
label_list=None):
super(BaseCVDatast, self).__init__(
super(BaseCVDataset, self).__init__(
base_path=base_path,
train_file=train_list_file,
dev_file=validate_list_file,
Expand Down Expand Up @@ -65,7 +65,7 @@ def _read_file(self, data_path, phase=None):
return data


# discarded. please use BaseCVDatast
# discarded. please use BaseCVDataset
class ImageClassificationDataset(object):
def __init__(self):
logger.warning(
Expand Down
66 changes: 42 additions & 24 deletions paddlehub/dataset/base_nlp_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,10 @@
import csv

from paddlehub.dataset import InputExample, BaseDataset
from paddlehub.common.logger import logger


class BaseNLPDatast(BaseDataset):
class BaseNLPDataset(BaseDataset):
def __init__(self,
base_path,
train_file=None,
Expand All @@ -32,49 +33,66 @@ def __init__(self,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
super(BaseNLPDatast, self).__init__(
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False):
super(BaseNLPDataset, self).__init__(
base_path=base_path,
train_file=train_file,
dev_file=dev_file,
test_file=test_file,
predict_file=predict_file,
label_file=label_file,
label_list=label_list,
train_file_with_head=train_file_with_head,
dev_file_with_head=dev_file_with_head,
test_file_with_head=test_file_with_head,
predict_file_with_head=predict_file_with_head)
train_file_with_header=train_file_with_header,
dev_file_with_header=dev_file_with_header,
test_file_with_header=test_file_with_header,
predict_file_with_header=predict_file_with_header)

def _read_file(self, input_file, phase=None):
"""Reads a tab separated value file."""
has_warned = False
with io.open(input_file, "r", encoding="UTF-8") as file:
reader = csv.reader(file, delimiter="\t", quotechar=None)
examples = []
for (i, line) in enumerate(reader):
if i == 0:
ncol = len(line)
if self.if_file_with_head[phase]:
if self.if_file_with_header[phase]:
continue
if ncol == 1:
if phase != "predict":
example = InputExample(guid=i, text_a=line[0])
else:
if phase != "predict":
if ncol == 1:
raise Exception(
"the %s file: %s only has one column but it is not a predict file"
% (phase, input_file))
elif ncol == 2:
example = InputExample(
guid=i, text_a=line[0], label=line[1])
elif ncol == 3:
example = InputExample(
guid=i, text_a=line[0], text_b=line[1], label=line[2])
elif ncol == 2:
example = InputExample(
guid=i, text_a=line[0], label=line[1])
elif ncol == 3:
example = InputExample(
guid=i,
text_a=line[0],
text_b=line[1],
label=line[2])
else:
raise Exception(
"the %s file: %s has too many columns (should <=3)"
% (phase, input_file))
else:
raise Exception(
"the %s file: %s has too many columns (should <=3)" %
(phase, input_file))
if ncol == 1:
example = InputExample(guid=i, text_a=line[0])
elif ncol == 2:
if not has_warned:
logger.warning(
"the predict file: %s has 2 columns, as it is a predict file, the second one will be regarded as text_b"
% (input_file))
has_warned = True
example = InputExample(
guid=i, text_a=line[0], text_b=line[1])
else:
raise Exception(
"the predict file: %s has too many columns (should <=2)"
% (input_file))
examples.append(example)
return examples
4 changes: 2 additions & 2 deletions paddlehub/dataset/bq.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


class BQ(BaseNLPDatast):
class BQ(BaseNLPDataset):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add dataset comments, like MSRA_NER and the other datasets.

def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "bq")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/chnsenticorp.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


class ChnSentiCorp(BaseNLPDatast):
class ChnSentiCorp(BaseNLPDataset):
"""
ChnSentiCorp (by Tan Songbo at ICT of Chinese Academy of Sciences, and for
opinion mining)
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/cmrc2018.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/cmrc2018.tar.gz"
SPIECE_UNDERLINE = '▁'
Expand Down Expand Up @@ -62,7 +62,7 @@ def __repr__(self):
return s


class CMRC2018(BaseNLPDatast):
class CMRC2018(BaseNLPDataset):
"""A single set of features of data."""

def __init__(self):
Expand Down
18 changes: 9 additions & 9 deletions paddlehub/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,10 @@ def __init__(self,
predict_file=None,
label_file=None,
label_list=None,
train_file_with_head=False,
dev_file_with_head=False,
test_file_with_head=False,
predict_file_with_head=False):
train_file_with_header=False,
dev_file_with_header=False,
test_file_with_header=False,
predict_file_with_header=False):
if not (train_file or dev_file or test_file):
raise ValueError("At least one file should be assigned")
self.base_path = base_path
Expand All @@ -83,11 +83,11 @@ def __init__(self,
self.test_examples = []
self.predict_examples = []

self.if_file_with_head = {
"train": train_file_with_head,
"dev": dev_file_with_head,
"test": test_file_with_head,
"predict": predict_file_with_head
self.if_file_with_header = {
"train": train_file_with_header,
"dev": dev_file_with_header,
"test": test_file_with_header,
"predict": predict_file_with_header
}

if train_file:
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/dogcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class DogCatDataset(BaseCVDatast):
class DogCatDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "dog-cat")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/drcd.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/drcd.tar.gz"
SPIECE_UNDERLINE = '▁'
Expand Down Expand Up @@ -62,7 +62,7 @@ def __repr__(self):
return s


class DRCD(BaseNLPDatast):
class DRCD(BaseNLPDataset):
"""A single set of features of data."""

def __init__(self):
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/flowers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class FlowersDataset(BaseCVDatast):
class FlowersDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "flower_photos")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/food101.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class Food101Dataset(BaseCVDatast):
class Food101Dataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "food-101",
"images")
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/glue.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
from paddlehub.dataset import InputExample
from paddlehub.common.logger import logger
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/glue_data.tar.gz"


class GLUE(BaseNLPDatast):
class GLUE(BaseNLPDataset):
"""
Please refer to
https://gluebenchmark.com
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/iflytek.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/iflytek.tar.gz"


class IFLYTEK(BaseNLPDatast):
class IFLYTEK(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "iflytek")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/indoor67.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class Indoor67Dataset(BaseCVDatast):
class Indoor67Dataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME, "Indoor67")
base_path = self._download_dataset(
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/inews.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/inews.tar.gz"


class INews(BaseNLPDatast):
class INews(BaseNLPDataset):
"""
INews is a sentiment analysis dataset for Internet News
"""
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/lcqmc.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/lcqmc.tar.gz"


class LCQMC(BaseNLPDatast):
class LCQMC(BaseNLPDataset):
def __init__(self):
dataset_dir = os.path.join(DATA_HOME, "lcqmc")
base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/msra_ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/msra_ner.tar.gz"


class MSRA_NER(BaseNLPDatast):
class MSRA_NER(BaseNLPDataset):
"""
A set of manually annotated Chinese word-segmentation data and
specifications for training and testing a Chinese word-segmentation system
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/nlpcc_dbqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@

from paddlehub.dataset import InputExample
from paddlehub.common.dir import DATA_HOME
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/nlpcc-dbqa.tar.gz"


class NLPCC_DBQA(BaseNLPDatast):
class NLPCC_DBQA(BaseNLPDataset):
"""
Please refer to
http://tcci.ccf.org.cn/conference/2017/dldoc/taskgline05.pdf
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/squad.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import BaseNLPDatast
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"

Expand Down Expand Up @@ -65,7 +65,7 @@ def __repr__(self):
return s


class SQUAD(BaseNLPDatast):
class SQUAD(BaseNLPDataset):
"""A single set of features of data."""

def __init__(self, version_2_with_negative=False):
Expand Down
4 changes: 2 additions & 2 deletions paddlehub/dataset/stanford_dogs.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@
import os

import paddlehub as hub
from paddlehub.dataset.base_cv_dataset import BaseCVDatast
from paddlehub.dataset.base_cv_dataset import BaseCVDataset


class StanfordDogsDataset(BaseCVDatast):
class StanfordDogsDataset(BaseCVDataset):
def __init__(self):
dataset_path = os.path.join(hub.common.dir.DATA_HOME,
"StanfordDogs-120")
Expand Down
Loading