@@ -233,32 +233,28 @@ dict_size = len(word_dict)
- Let's define our N-gram neural network structure. This structure is used in both training and prediction. Because the word vectors are sparse, we pass in the parameter `is_sparse == True` to speed up the updates of the sparse matrix.
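  (Note on the code below: all four `fluid.layers.embedding` calls pass `param_attr='shared_w'`, so they look up a single shared embedding table rather than learning four separate ones.)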

```python
-def inference_program(is_sparse):
-    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
-    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
-    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
-    fourth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
+def inference_program(words, is_sparse):

    embed_first = fluid.layers.embedding(
-        input=first_word,
+        input=words[0],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr='shared_w')
    embed_second = fluid.layers.embedding(
-        input=second_word,
+        input=words[1],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr='shared_w')
    embed_third = fluid.layers.embedding(
-        input=third_word,
+        input=words[2],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
        param_attr='shared_w')
    embed_fourth = fluid.layers.embedding(
-        input=fourth_word,
+        input=words[3],
        size=[dict_size, EMBED_SIZE],
        dtype='float32',
        is_sparse=is_sparse,
@@ -276,74 +272,112 @@ def inference_program(is_sparse):
- Based on the network structure above, we can define our `training` method as follows:

```python
-def train_program(is_sparse):
+def train_program(predict_word):
    # The declaration of 'next_word' must be after the invoking of inference_program,
    # or the data input order of train program would be [next_word, firstw, secondw,
    # thirdw, fourthw], which is not correct.
-    predict_word = inference_program(is_sparse)
    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
    cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
    avg_cost = fluid.layers.mean(cost)
    return avg_cost
-```
-
-- Now we can start training. The current version is much simpler than before. We have ready-made training and test sets: `paddle.dataset.imikolov.train()` and `paddle.dataset.imikolov.test()`. Both return a reader. In PaddlePaddle, a reader is a Python function that reads the next data item each time it is called; it is a Python generator.
-
-  `paddle.batch` takes a reader and outputs a batched reader. An `event_handler` can also be passed to `trainer.train` to print the training status of each step and batch from time to time.
-
-```python

def optimizer_func():
-    # Note here we need to choose more sophisticated optimizers
-    # such as AdaGrad with a decay rate. The normal SGD converges
-    # very slowly.
-    # optimizer=fluid.optimizer.SGD(learning_rate=0.001),
    return fluid.optimizer.AdagradOptimizer(
        learning_rate=3e-3,
        regularization=fluid.regularizer.L2DecayRegularizer(8e-4))

+```
+
+- Now we can start training. The current version is much simpler than before. We have ready-made training and test sets: `paddle.dataset.imikolov.train()` and `paddle.dataset.imikolov.test()`. Both return a reader. In PaddlePaddle, a reader is a Python function that reads the next data item each time it is called; it is a Python generator.
+
+  `paddle.batch` takes a reader and outputs a batched reader. We can also print the training status of each step and batch during training.
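  To make the reader idea concrete, here is a minimal sketch. The `toy_reader` below is a hypothetical stand-in for `paddle.dataset.imikolov.train(word_dict, N)`; only `paddle.batch` is the library API:

```python
import paddle

# A reader is a function that returns a generator yielding one data item per
# iteration. This toy generator function yields fake 5-tuples shaped like the
# imikolov data: 4 context word ids plus the next word id.
def toy_reader():
    for i in range(8):
        yield (i, i + 1, i + 2, i + 3, i + 4)

# paddle.batch wraps a reader and returns a batched reader whose generator
# yields lists of batch_size items.
batched_reader = paddle.batch(toy_reader, batch_size=4)
for mini_batch in batched_reader():
    print(len(mini_batch))  # prints 4 twice: 8 items in batches of 4
```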

+```python
-def train(use_cuda, train_program, params_dirname):
+def train(if_use_cuda, params_dirname, is_sparse=True):
+    place = fluid.CUDAPlace(0) if if_use_cuda else fluid.CPUPlace()

    train_reader = paddle.batch(
        paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE)

-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+    second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+    third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+    forth_word = fluid.layers.data(name='fourthw', shape=[1], dtype='int64')
+    next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')

-    def event_handler(event):
-        if isinstance(event, EndStepEvent):
-            # We output cost every 10 steps.
-            if event.step % 10 == 0:
-                outs = trainer.test(
-                    reader=test_reader,
-                    feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw'])
-                avg_cost = outs[0]
-
-                print("Step %d: Average Cost %f" % (event.step, avg_cost))
-
-                # If average cost is lower than 5.8, we consider the model good enough to stop.
-                # Note 5.8 is a relatively high value. In order to get a better model, one should
-                # aim for avg_cost lower than 3.5. But the training could take longer time.
-                if avg_cost < 5.8:
-                    trainer.save_params(params_dirname)
-                    trainer.stop()
-
-                if math.isnan(avg_cost):
+    word_list = [first_word, second_word, third_word, forth_word, next_word]
+    feed_order = ['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw']
+
+    main_program = fluid.default_main_program()
+    star_program = fluid.default_startup_program()
+
+    predict_word = inference_program(word_list, is_sparse)
+    avg_cost = train_program(predict_word)
+    test_program = main_program.clone(for_test=True)
+
+    sgd_optimizer = optimizer_func()
+    sgd_optimizer.minimize(avg_cost)
+
+    exe = fluid.Executor(place)
+
+    def train_test(program, reader):
+        count = 0
+        feed_var_list = [
+            program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder_test = fluid.DataFeeder(feed_list=feed_var_list, place=place)
+        test_exe = fluid.Executor(place)
+        accumulated = len([avg_cost]) * [0]
+        for test_data in reader():
+            avg_cost_np = test_exe.run(
+                program=program,
+                feed=feeder_test.feed(test_data),
+                fetch_list=[avg_cost])
+            accumulated = [
+                x[0] + x[1][0] for x in zip(accumulated, avg_cost_np)
+            ]
+            count += 1
+        return [x / count for x in accumulated]
+
+    def train_loop():
+        step = 0
+        feed_var_list_loop = [
+            main_program.global_block().var(var_name) for var_name in feed_order
+        ]
+        feeder = fluid.DataFeeder(feed_list=feed_var_list_loop, place=place)
+        exe.run(star_program)
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                avg_cost_np = exe.run(
+                    main_program, feed=feeder.feed(data), fetch_list=[avg_cost])
+
+                if step % 10 == 0:
+                    outs = train_test(test_program, test_reader)
+
+                    print("Step %d: Average Cost %f" % (step, outs[0]))
+
+                    # It will take a few hours to train fully.
+                    # If average cost is lower than 5.8, we consider the model good enough to stop.
+                    # Note 5.8 is a relatively high value. In order to get a better model, one should
+                    # aim for avg_cost lower than 3.5. But the training could take longer time.
+                    if outs[0] < 5.8:
+                        if params_dirname is not None:
+                            fluid.io.save_inference_model(params_dirname, [
+                                'firstw', 'secondw', 'thirdw', 'fourthw'
+                            ], [predict_word], exe)
+                        return
+                step += 1
+                if math.isnan(float(avg_cost_np[0])):
                    sys.exit("got NaN loss, training failed.")

-    trainer = Trainer(
-        train_func=train_program,
-        optimizer_func=optimizer_func,
-        place=place)
+        raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))

-    trainer.train(
-        reader=train_reader,
-        num_epochs=1,
-        event_handler=event_handler,
-        feed_order=['firstw', 'secondw', 'thirdw', 'fourthw', 'nextw'])
+    train_loop()
```
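One ordering detail in `train` above is worth a note: `main_program.clone(for_test=True)` runs before `optimizer_func().minimize(avg_cost)`, so `test_program` keeps only the forward (inference and cost) operators, while `minimize` appends the backward and parameter-update operators to `main_program` alone. A self-contained sketch of that pattern on a toy network (for illustration only, not part of the tutorial):

```python
import paddle.fluid as fluid

# Toy forward pass: one fc layer with a mean squared error cost.
x = fluid.layers.data(name='x', shape=[1], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

# Clone BEFORE minimize: the clone holds only the forward operators.
test_prog = fluid.default_main_program().clone(for_test=True)
# minimize appends backward and parameter-update operators to the default program.
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

print(len(test_prog.block(0).ops))                     # forward ops only
print(len(fluid.default_main_program().block(0).ops))  # forward + backward + update
```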

-- `trainer.train` will start the training. The monitoring information returned from the `event_handler` looks like this:
+- `train_loop` will start the training. The monitoring information printed during training looks like this:

```text
Step 0: Average Cost 7.337213
@@ -360,53 +394,75 @@ Step 20: Average Cost 5.766995
We can use our trained model to predict the next word, given the preceding N-gram.

```python
-def infer(use_cuda, inference_program, params_dirname=None):
+def infer(use_cuda, params_dirname=None):
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = Inferencer(
-        infer_func=inference_program, param_path=params_dirname, place=place)
-
-    # Set up inputs by creating 4 LoDTensors representing 4 words. Here each word
-    # is simply an index used to look up the corresponding word vector, hence
-    # the shape of each word (base_shape) should be [1]. The length-based level of
-    # detail (lod) info of each LoDTensor should be [[1]], meaning there is only
-    # one lod_level and only one sequence of one word on this level.
-    # Note that lod info should be a list of lists.
-
-    data1 = [[211]]  # 'among'
-    data2 = [[6]]  # 'a'
-    data3 = [[96]]  # 'group'
-    data4 = [[4]]  # 'of'
-    lod = [[1]]
-
-    first_word = fluid.create_lod_tensor(data1, lod, place)
-    second_word = fluid.create_lod_tensor(data2, lod, place)
-    third_word = fluid.create_lod_tensor(data3, lod, place)
-    fourth_word = fluid.create_lod_tensor(data4, lod, place)
-
-    result = inferencer.infer(
-        {
-            'firstw': first_word,
-            'secondw': second_word,
-            'thirdw': third_word,
-            'fourthw': fourth_word
-        },
-        return_numpy=False)
-
-    print(numpy.array(result[0]))
-    most_possible_word_index = numpy.argmax(result[0])
-    print(most_possible_word_index)
-    print([
-        key for key, value in six.iteritems(word_dict)
-        if value == most_possible_word_index
-    ][0])
+
+    exe = fluid.Executor(place)
+
+    inference_scope = fluid.core.Scope()
+    with fluid.scope_guard(inference_scope):
+        # Use fluid.io.load_inference_model to obtain the inference program desc,
+        # the feed_target_names (the names of variables that will be fed
+        # data using feed operators), and the fetch_targets (variables that
+        # we want to obtain data from using fetch operators).
+        [inferencer, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(params_dirname, exe)
+
+        # Set up inputs by creating 4 LoDTensors representing 4 words. Here each word
+        # is simply an index used to look up the corresponding word vector, hence
+        # the shape of each word (base_shape) should be [1]. The recursive_sequence_lengths,
+        # which is the length-based level of detail (lod) of each LoDTensor, should be [[1]],
+        # meaning there is only one level of detail and only one sequence of
+        # one word on this level.
+        # Note that recursive_sequence_lengths should be a list of lists.
+        data1 = [[211]]  # 'among'
+        data2 = [[6]]  # 'a'
+        data3 = [[96]]  # 'group'
+        data4 = [[4]]  # 'of'
+        lod = [[1]]
+
+        first_word = fluid.create_lod_tensor(data1, lod, place)
+        second_word = fluid.create_lod_tensor(data2, lod, place)
+        third_word = fluid.create_lod_tensor(data3, lod, place)
+        fourth_word = fluid.create_lod_tensor(data4, lod, place)
+
+        assert feed_target_names[0] == 'firstw'
+        assert feed_target_names[1] == 'secondw'
+        assert feed_target_names[2] == 'thirdw'
+        assert feed_target_names[3] == 'fourthw'
+
+        # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+        # and results will contain a list of data corresponding to fetch_targets.
+        results = exe.run(
+            inferencer,
+            feed={
+                feed_target_names[0]: first_word,
+                feed_target_names[1]: second_word,
+                feed_target_names[2]: third_word,
+                feed_target_names[3]: fourth_word
+            },
+            fetch_list=fetch_targets,
+            return_numpy=False)
+
+        print(numpy.array(results[0]))
+        most_possible_word_index = numpy.argmax(results[0])
+        print(most_possible_word_index)
+        print([
+            key for key, value in six.iteritems(word_dict)
+            if value == most_possible_word_index
+        ][0])

```
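A small usage note on the decoding step above (a sketch reusing `word_dict` and `most_possible_word_index` from the code): when many predictions must be decoded, building the inverse dictionary once is cheaper than scanning `word_dict` for every lookup:

```python
# Invert word_dict once (index -> word) instead of a linear scan per prediction.
inv_word_dict = {index: word for word, index in six.iteritems(word_dict)}
print(inv_word_dict[most_possible_word_index])
```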

-After a brief three-minute training run, we get the following prediction. Our model predicts that the next word after `among a group of` is `a`, which fits grammatical rules fairly well. If we trained longer, say for a few hours, the next word predicted would be `workers`.
+Since the word-embedding matrix itself is rather sparse, training to a given accuracy takes quite a while. To show results quickly, this tutorial stops after only a small amount of training and then makes the following prediction. Our model predicts that the next word after `among a group of` is `the`, which fits grammatical rules fairly well. If we trained longer, say for a few hours, the next word predicted would be `workers`.

```text
-[[0.00106646 0.0007907  0.00072041 ... 0.00049024 0.00041355 0.00084464]]
-6
-a
+[[0.03768077 0.03463154 0.00018074 ... 0.00022283 0.00029888 0.02967956]]
+0
+the
```
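In this output, the first line is the model's probability distribution over the whole dictionary for the next word, the second line is the index of the most probable word, and the third line is that word itself.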

The entry point of the whole program is simple:
@@ -419,14 +475,11 @@ def main(use_cuda, is_sparse):
    params_dirname = "word2vec.inference.model"

    train(
-        use_cuda=use_cuda,
-        train_program=partial(train_program, is_sparse),
-        params_dirname=params_dirname)
-
-    infer(
-        use_cuda=use_cuda,
-        inference_program=partial(inference_program, is_sparse),
-        params_dirname=params_dirname)
+        if_use_cuda=use_cuda,
+        params_dirname=params_dirname,
+        is_sparse=is_sparse)
+
+    infer(use_cuda=use_cuda, params_dirname=params_dirname)


main(use_cuda=use_cuda, is_sparse=True)