@@ -64,7 +64,7 @@ def forward(self, input_ids, position_ids=None):

class DistilBertPretrainedModel(PretrainedModel):
    """
-    An abstract class for pretrained DistilBERT models. It provides DistilBERT related
+    An abstract class for pretrained DistilBert models. It provides DistilBert related
    `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`,
    `pretrained_init_configuration`, `base_model_prefix` for downloading and
    loading pretrained models. See `PretrainedModel` for more details.
@@ -131,6 +131,62 @@ def init_weights(self, layer):

@register_base_model
class DistilBertModel(DistilBertPretrainedModel):
+    """
+    The bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.
+
+    This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`.
+    Refer to the superclass documentation for the generic methods.
+
+    This model is also a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation
+    /docs/en/api/paddle/fluid/dygraph/layers/Layer_en.html>`__ subclass. Use it as a regular Paddle Layer
+    and refer to the Paddle documentation for all matters related to general usage and behavior.
+
+    Args:
+        vocab_size (int):
+            Vocabulary size of `input_ids` in `DistilBertModel`. Defines the number of different tokens that can
+            be represented by the `input_ids` passed when calling `DistilBertModel`.
+        hidden_size (int, optional):
+            Dimensionality of the embedding layer, encoder layers and the pooler layer. Defaults to `768`.
+        num_hidden_layers (int, optional):
+            Number of hidden layers in the Transformer encoder. Defaults to `12`.
+        num_attention_heads (int, optional):
+            Number of attention heads for each attention layer in the Transformer encoder.
+            Defaults to `12`.
+        intermediate_size (int, optional):
+            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors
+            to ff layers are firstly projected from `hidden_size` to `intermediate_size`,
+            and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.
+            Defaults to `3072`.
+        hidden_act (str, optional):
+            The non-linear activation function in the feed-forward layer.
+            ``"gelu"``, ``"relu"`` and any other Paddle-supported activation functions
+            are supported. Defaults to `"gelu"`.
+        hidden_dropout_prob (float, optional):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+            Defaults to `0.1`.
+        attention_probs_dropout_prob (float, optional):
+            The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target.
+            Defaults to `0.1`.
+        max_position_embeddings (int, optional):
+            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input
+            sequence. Defaults to `512`.
+        type_vocab_size (int, optional):
+            The vocabulary size of `token_type_ids`.
+            Defaults to `16`.
+        initializer_range (float, optional):
+            The standard deviation of the normal initializer.
+            Defaults to `0.02`.
+
+            .. note::
+                A normal_initializer initializes weight matrices as normal distributions.
+                See :meth:`DistilBertPretrainedModel.init_weights()` for how weights are initialized in `DistilBertModel`.
+
+        pad_token_id (int, optional):
+            The index of padding token in the token vocabulary.
+            Defaults to `0`.
+
+    """
+
    def __init__(self,
                 vocab_size,
                 hidden_size=768,
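As a rough construction sketch (not part of this diff), the arguments documented above can be passed straight to the constructor to build an untrained backbone; the values below simply mirror the documented defaults, and `vocab_size=30522` is assumed to match the `distilbert-base-uncased` vocabulary.

.. code-block::

    import paddle
    from paddlenlp.transformers import DistilBertModel

    # Build an untrained model from explicit config values (documented defaults).
    model = DistilBertModel(
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        pad_token_id=0)

    # Random token ids stand in for tokenizer output here.
    input_ids = paddle.randint(low=1, high=30522, shape=[1, 8], dtype="int64")
    encoder_output = model(input_ids)  # [1, 8, 768]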
@@ -162,6 +218,44 @@ def __init__(self,
        self.apply(self.init_weights)

    def forward(self, input_ids, attention_mask=None):
+        r'''
+        The DistilBertModel forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                Indices of input sequence tokens in the vocabulary. They are
+                numerical representations of tokens that build the input sequence.
+                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
+            attention_mask (Tensor, optional):
+                Mask used in multi-head attention to avoid performing attention to some unwanted positions,
+                usually the paddings or the subsequent positions.
+                Its data type can be int, float and bool.
+                When the data type is bool, the `masked` tokens have `False` values and the others have `True` values.
+                When the data type is int, the `masked` tokens have `0` values and the others have `1` values.
+                When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.
+                It is a tensor whose shape is broadcast to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.
+                For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length],
+                [batch_size, num_attention_heads, sequence_length, sequence_length].
+                Defaults to `None`, which means no positions are masked.
+
+        Returns:
+            Tensor: Returns tensor `encoder_output`, which is the sequence of hidden-states at the last layer of the model.
+            Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import DistilBertModel, DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                output = model(**inputs)
+        '''
+
        if attention_mask is None:
            attention_mask = paddle.unsqueeze(
                (input_ids == self.pad_token_id
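To make the `attention_mask` conventions documented above concrete, here is a hedged sketch (not part of this diff) that builds a float-style padding mask by hand, roughly mirroring what the lines above do when the argument is omitted; the `[batch_size, 1, 1, sequence_length]` shape is one of the broadcastable forms.

.. code-block::

    import paddle
    from paddlenlp.transformers import DistilBertModel, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertModel.from_pretrained('distilbert-base-uncased')

    input_ids = paddle.to_tensor(
        [tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")["input_ids"]])

    # Float convention: 0 for positions to attend to, a large negative value for
    # padding; the [batch_size, 1, 1, sequence_length] shape broadcasts to
    # [batch_size, num_attention_heads, sequence_length, sequence_length].
    attention_mask = paddle.unsqueeze(
        (input_ids == tokenizer.pad_token_id).astype("float32") * -1e9,
        axis=[1, 2])

    encoder_output = model(input_ids, attention_mask=attention_mask)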
@@ -174,6 +268,21 @@ def forward(self, input_ids, attention_mask=None):


class DistilBertForSequenceClassification(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the output layer, designed for
+    sequence classification/regression tasks like GLUE tasks.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        num_classes (int, optional):
+            The number of classes. Defaults to `2`.
+        dropout (float, optional):
+            The dropout probability for output of DistilBert.
+            If None, use the same value as `hidden_dropout_prob` of `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
    def __init__(self, distilbert, num_classes=2, dropout=None):
        super(DistilBertForSequenceClassification, self).__init__()
        self.num_classes = num_classes
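As a small composition sketch (not part of this diff; the three-class setup is made up for illustration), the head can also be wrapped around an existing `DistilBertModel` instance, matching the `distilbert`, `num_classes` and `dropout` arguments documented above:

.. code-block::

    from paddlenlp.transformers import DistilBertModel, DistilBertForSequenceClassification

    # Wrap a pretrained backbone in the classification head.
    distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
    model = DistilBertForSequenceClassification(distilbert, num_classes=3, dropout=0.2)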
@@ -188,6 +297,36 @@ def __init__(self, distilbert, num_classes=2, dropout=None):
        self.apply(self.init_weights)

    def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForSequenceClassification forward method, overrides the __call__() special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `logits`, a tensor of the input text classification logits.
+            Shape as `[batch_size, num_classes]` and dtype as `float32`.
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForSequenceClassification
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                logits = model(**inputs)
+
+                print(logits.shape)  # [1, num_classes]
+        """
+
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask)

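A possible follow-on to the docstring example above (a sketch, not part of this diff): turning the `[batch_size, num_classes]` logits into class probabilities and a predicted label id.

.. code-block::

    import paddle
    import paddle.nn.functional as F

    # Softmax over the class axis gives probabilities; argmax gives the
    # predicted class index for each example in the batch.
    probs = F.softmax(logits, axis=-1)
    pred_class = paddle.argmax(logits, axis=-1)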
@@ -202,6 +341,19 @@ def forward(self, input_ids, attention_mask=None):


class DistilBertForQuestionAnswering(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the hidden-states output to
+    compute `span_start_logits` and `span_end_logits`, designed for question-answering tasks like SQuAD.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        dropout (float, optional):
+            The dropout probability for output of DistilBert.
+            If None, use the same value as `hidden_dropout_prob` of `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
    def __init__(self, distilbert, dropout=None):
        super(DistilBertForQuestionAnswering, self).__init__()
        self.distilbert = distilbert  # allow bert to be config
@@ -211,6 +363,46 @@ def __init__(self, distilbert, dropout=None):
        self.apply(self.init_weights)

    def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForQuestionAnswering forward method, overrides the __call__() special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            tuple: Returns tuple (`start_logits`, `end_logits`).
+
+            With the fields:
+
+            - start_logits (Tensor):
+                A tensor of the input token classification logits, indicating the start position of the labelled span.
+                Its data type should be float32 and its shape is [batch_size, sequence_length].
+
+            - end_logits (Tensor):
+                A tensor of the input token classification logits, indicating the end position of the labelled span.
+                Its data type should be float32 and its shape is [batch_size, sequence_length].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForQuestionAnswering
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                outputs = model(**inputs)
+
+                start_logits = outputs[0]
+                end_logits = outputs[1]
+        """
+
        sequence_output = self.distilbert(
            input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(sequence_output)
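Continuing the docstring example above with a rough, unchecked decoding sketch (not part of this diff; real SQuAD-style decoding also limits the span to the context and bounds its length), the most likely span can be read off `start_logits` and `end_logits`:

.. code-block::

    import paddle

    # Greedy decoding: take the highest-scoring start and end positions.
    start_index = paddle.argmax(start_logits, axis=-1).item()
    end_index = paddle.argmax(end_logits, axis=-1).item()

    span_ids = inputs["input_ids"][0][start_index:end_index + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(span_ids.tolist())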
@@ -221,6 +413,21 @@ def forward(self, input_ids, attention_mask=None):


class DistilBertForTokenClassification(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the hidden-states output layer,
+    designed for token classification tasks like NER tasks.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        num_classes (int, optional):
+            The number of classes. Defaults to `2`.
+        dropout (float, optional):
+            The dropout probability for output of DistilBert.
+            If None, use the same value as `hidden_dropout_prob` of `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
    def __init__(self, distilbert, num_classes=2, dropout=None):
        super(DistilBertForTokenClassification, self).__init__()
        self.num_classes = num_classes
@@ -232,6 +439,36 @@ def __init__(self, distilbert, num_classes=2, dropout=None):
        self.apply(self.init_weights)

    def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForTokenClassification forward method, overrides the __call__() special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `logits`, a tensor of the input token classification logits.
+            Shape as `[batch_size, sequence_length, num_classes]` and dtype as `float32`.
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForTokenClassification
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                logits = model(**inputs)
+
+                print(logits.shape)  # [1, sequence_length, num_classes]
+        """
+
        sequence_output = self.distilbert(
            input_ids, attention_mask=attention_mask)

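One hedged way to consume the `[batch_size, sequence_length, num_classes]` logits from the example above (a sketch, not part of this diff; mapping label ids to label names would come from the task's own label list):

.. code-block::

    import paddle

    # Argmax over the class axis gives one predicted label id per token.
    pred_label_ids = paddle.argmax(logits, axis=-1)  # [batch_size, sequence_length]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    for token, label_id in zip(tokens, pred_label_ids[0].tolist()):
        print(token, int(label_id))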
@@ -241,6 +478,14 @@ def forward(self, input_ids, attention_mask=None):


class DistilBertForMaskedLM(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a `language modeling` head on top.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+    """
+
    def __init__(self, distilbert):
        super(DistilBertForMaskedLM, self).__init__()
        self.distilbert = distilbert
@@ -255,6 +500,33 @@ def __init__(self, distilbert):
        self.apply(self.init_weights)

    def forward(self, input_ids=None, attention_mask=None):
+        r'''
+        The DistilBertForMaskedLM forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `prediction_logits`, the scores of masked token prediction.
+            Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import DistilBertForMaskedLM, DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                prediction_logits = model(**inputs)
+        '''
+
        distilbert_output = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask)
        prediction_logits = self.vocab_transform(distilbert_output)
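As a small follow-on sketch (not part of this diff; the masked position is chosen arbitrarily for illustration), `prediction_logits` can be decoded back to a token with the tokenizer:

.. code-block::

    import paddle
    from paddlenlp.transformers import DistilBertForMaskedLM, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

    # Tokenize a sentence, then mask one position by hand.
    input_ids = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")["input_ids"]
    mask_position = 3  # arbitrary position chosen for illustration
    input_ids[mask_position] = tokenizer.mask_token_id

    prediction_logits = model(input_ids=paddle.to_tensor([input_ids]))

    # Highest-scoring vocabulary id at the masked position.
    predicted_id = paddle.argmax(prediction_logits[0, mask_position]).item()
    print(tokenizer.convert_ids_to_tokens([predicted_id]))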