Skip to content

Commit 5777fae

Browse files
authored
Merge pull request #2 from PaddlePaddle/develop
rebase
2 parents 021b3a4 + f93af82 commit 5777fae

File tree

24 files changed

+1349
-67
lines changed

24 files changed

+1349
-67
lines changed

.travis.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ before_install:
5050
fi
5151
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
5252
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
53-
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy
53+
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
5454
script:
5555
- paddle/scripts/travis/main.sh
5656
notifications:

demo/image_classification/predict.sh

100644100755
File mode changed.

demo/semantic_role_labeling/predict.sh

100644100755
File mode changed.

demo/semantic_role_labeling/test.sh

100644100755
File mode changed.

demo/semantic_role_labeling/train.sh

100644100755
File mode changed.

demo/seqToseq/dataprovider.py

+31-10
Original file line numberDiff line numberDiff line change
@@ -19,27 +19,44 @@
1919
END = "<e>"
2020

2121

22-
def hook(settings, src_dict, trg_dict, file_list, **kwargs):
22+
def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
23+
**kwargs):
2324
# job_mode = 1: training mode
2425
# job_mode = 0: generating mode
25-
settings.job_mode = trg_dict is not None
26-
settings.src_dict = src_dict
26+
settings.job_mode = not is_generating
27+
settings.src_dict = dict()
28+
with open(src_dict_path, "r") as fin:
29+
settings.src_dict = {
30+
line.strip(): line_count
31+
for line_count, line in enumerate(fin)
32+
}
33+
settings.trg_dict = dict()
34+
with open(trg_dict_path, "r") as fin:
35+
settings.trg_dict = {
36+
line.strip(): line_count
37+
for line_count, line in enumerate(fin)
38+
}
39+
2740
settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
2841
settings.sample_count = 0
2942

3043
if settings.job_mode:
31-
settings.trg_dict = trg_dict
32-
settings.slots = [
44+
settings.slots = {
45+
'source_language_word':
3346
integer_value_sequence(len(settings.src_dict)),
47+
'target_language_word':
3448
integer_value_sequence(len(settings.trg_dict)),
49+
'target_language_next_word':
3550
integer_value_sequence(len(settings.trg_dict))
36-
]
51+
}
3752
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
3853
else:
39-
settings.slots = [
54+
settings.slots = {
55+
'source_language_word':
4056
integer_value_sequence(len(settings.src_dict)),
57+
'sent_id':
4158
integer_value_sequence(len(open(file_list[0], "r").readlines()))
42-
]
59+
}
4360

4461

4562
def _get_ids(s, dictionary):
@@ -69,6 +86,10 @@ def process(settings, file_name):
6986
continue
7087
trg_ids_next = trg_ids + [settings.trg_dict[END]]
7188
trg_ids = [settings.trg_dict[START]] + trg_ids
72-
yield src_ids, trg_ids, trg_ids_next
89+
yield {
90+
'source_language_word': src_ids,
91+
'target_language_word': trg_ids,
92+
'target_language_next_word': trg_ids_next
93+
}
7394
else:
74-
yield src_ids, [line_count]
95+
yield {'source_language_word': src_ids, 'sent_id': [line_count]}

demo/seqToseq/seqToseq_net.py

+5-9
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir,
3737
"""
3838
src_lang_dict = os.path.join(data_dir, 'src.dict')
3939
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
40-
src_dict = dict()
41-
for line_count, line in enumerate(open(src_lang_dict, "r")):
42-
src_dict[line.strip()] = line_count
43-
trg_dict = dict()
44-
for line_count, line in enumerate(open(trg_lang_dict, "r")):
45-
trg_dict[line.strip()] = line_count
4640

4741
if is_generating:
4842
train_list = None
4943
test_list = os.path.join(data_dir, gen_list)
50-
trg_dict = None
5144
else:
5245
train_list = os.path.join(data_dir, train_list)
5346
test_list = os.path.join(data_dir, test_list)
@@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir,
5750
test_list,
5851
module="dataprovider",
5952
obj="process",
60-
args={"src_dict": src_dict,
61-
"trg_dict": trg_dict})
53+
args={
54+
"src_dict_path": src_lang_dict,
55+
"trg_dict_path": trg_lang_dict,
56+
"is_generating": is_generating
57+
})
6258

6359
return {
6460
"src_dict_path": src_lang_dict,

doc/conf.py.in

+3-4
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ AutoStructify = transform.AutoStructify
2323
# documentation root, use os.path.abspath to make it absolute, like shown here.
2424
sys.path.insert(0, '@PROJ_ROOT@/python')
2525

26-
templates_path = ["@PROJ_ROOT@/doc/templates"]
26+
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
2727

2828
# -- General configuration ------------------------------------------------
2929

@@ -113,13 +113,12 @@ todo_include_todos = False
113113

114114
# The theme to use for HTML and HTML Help pages. See the documentation for
115115
# a list of builtin themes.
116-
#html_theme = 'sphinx_rtd_theme'
117-
html_theme = 'classic'
116+
html_theme = 'sphinx_rtd_theme'
118117

119118
# Add any paths that contain custom static files (such as style sheets) here,
120119
# relative to this directory. They are copied after the builtin static files,
121120
# so a file named "default.css" will overwrite the builtin "default.css".
122-
html_static_path = ['_static']
121+
html_static_path = ['@PROJ_ROOT@/doc_theme/static']
123122

124123
# Output file base name for HTML help builder.
125124
htmlhelp_basename = project + 'doc'

doc/howto/cmd_parameter/arguments.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
143143
</tr>
144144

145145
<tr>
146-
<td class="left" rowspan = "2">testing during training</td><td class="left">test_all_data_in_one_period</td>
146+
<td class="left" rowspan = "2">testing during training</td><td class="left">test_period</td>
147147
<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
148148
</tr>
149149

doc/howto/cmd_parameter/detail_introduction.md

+3-7
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
- type: string (default: null).
3232

3333
* `--version`
34-
- Whether to print version infomatrion.
34+
- Whether to print version information.
3535
- type: bool (default: 0).
3636

3737
* `--show_layer_stat`
@@ -110,8 +110,8 @@
110110
- type: int32 (default: -1).
111111

112112
* `--test_period`
113-
- Run testing every test_period train batches. If not set, run testing each pass.
114-
- type: int32 (default: 1000).
113+
- If set to 0, test on all test data at the end of each pass. If set to a non-zero value, test on all test data every test_period batches.
114+
- type: int32 (default: 0).
115115

116116
* `--test_wait`
117117
- Whether to wait for the parameters of each pass if they do not exist yet. If test_data_path is set in the cluster submitting environment, one process is launched to perform testing, so test_wait=1 must be set. Note that in the cluster submitting environment, this argument is set to True by default.
@@ -121,10 +121,6 @@
121121
- File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
122122
- type: string (default: "", null).
123123

124-
* `--test_all_data_in_one_period`
125-
- This argument is usually used in testing period during traning. If true, all data will be tested in one test period. Otherwise (batch_size * log_peroid) data will be tested.
126-
- type: bool (default: 0).
127-
128124
* `--predict_output_dir`
129125
- Directory that saves the layer output. It is configured in Outputs() in network config. By default, this argument is null, meaning nothing is saved. Specify this directory if you want to save the feature maps of some layers in testing mode. Note that layer outputs are the values after the activation function.
130126
- type: string (default: "", null).

doc/howto/cmd_parameter/use_case.md

+2-3
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@ paddle train \
1010
--config=network_config \
1111
--save_dir=output \
1212
--trainer_count=COUNT \ #(default:1)
13-
--test_period=M \ #(default:1000)
14-
--test_all_data_in_one_period=true \ #(default:false)
15-
--num_passes=N \ #(defalut:100)
13+
--test_period=M \ #(default:0)
14+
--num_passes=N \ #(default:100)
1615
--log_period=K \ #(default:100)
1716
--dot_period=1000 \ #(default:1)
1817
#[--show_parameter_stats_period=100] \ #(default:0)

doc_cn/conf.py.in

+4-4
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ AutoStructify = transform.AutoStructify
2222
# add these directories to sys.path here. If the directory is relative to the
2323
# documentation root, use os.path.abspath to make it absolute, like shown here.
2424
sys.path.insert(0, '@PROJ_ROOT@/python')
25-
templates_path = ["@PROJ_ROOT@/doc/templates"]
25+
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
2626

2727
# -- General configuration ------------------------------------------------
2828

@@ -112,12 +112,12 @@ todo_include_todos = False
112112

113113
# The theme to use for HTML and HTML Help pages. See the documentation for
114114
# a list of builtin themes.
115-
#html_theme = 'sphinx_rtd_theme' # sphinx_rtd_theme will cause table bad style
116-
html_theme = 'classic'
115+
html_theme = 'sphinx_rtd_theme'
116+
117117
# Add any paths that contain custom static files (such as style sheets) here,
118118
# relative to this directory. They are copied after the builtin static files,
119119
# so a file named "default.css" will overwrite the builtin "default.css".
120-
html_static_path = ['_static']
120+
html_static_path = ['@PROJ_ROOT@/doc_theme/static']
121121

122122
# Output file base name for HTML help builder.
123123
htmlhelp_basename = project + 'doc'

doc_cn/faq/index.rst

+38
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,41 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
214214
cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
215215
216216
用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>``
217+
218+
10. A protocol message was rejected because it was too big
219+
----------------------------------------------------------
220+
221+
如果在训练NLP相关模型时,出现以下错误:
222+
223+
.. code-block:: bash
224+
225+
[libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
226+
F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
227+
228+
可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似:
229+
230+
.. code-block:: python
231+
232+
src_dict = dict()
233+
for line_count, line in enumerate(open(src_dict_path, "r")):
234+
src_dict[line.strip()] = line_count
235+
236+
define_py_data_sources2(
237+
train_list,
238+
test_list,
239+
module="dataprovider",
240+
obj="process",
241+
args={"src_dict": src_dict})
242+
243+
解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为:
244+
245+
.. code-block:: python
246+
247+
define_py_data_sources2(
248+
train_list,
249+
test_list,
250+
module="dataprovider",
251+
obj="process",
252+
args={"src_dict_path": src_dict_path})
253+
254+
完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。

0 commit comments

Comments
 (0)