
Commit 499cbe4

Modify Distilbert docstring (PaddlePaddle#949)
* modify distilbert
* modify distilbert
* modify nezha tokenizer
* modify nezha
* update
* modify models
* modify args
* fix errors
* modify args
1 parent 15693c7 commit 499cbe4


4 files changed (+968, -230 lines)


paddlenlp/transformers/distilbert/modeling.py

Lines changed: 273 additions & 1 deletion
@@ -64,7 +64,7 @@ def forward(self, input_ids, position_ids=None):
 
 class DistilBertPretrainedModel(PretrainedModel):
     """
-    An abstract class for pretrained DistilBERT models. It provides DistilBERT related
+    An abstract class for pretrained DistilBert models. It provides DistilBert related
     `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`,
     `pretrained_init_configuration`, `base_model_prefix` for downloading and
     loading pretrained models. See `PretrainedModel` for more details.
@@ -131,6 +131,62 @@ def init_weights(self, layer):
 
 @register_base_model
 class DistilBertModel(DistilBertPretrainedModel):
+    """
+    The bare DistilBert Model transformer outputting raw hidden-states without any specific head on top.
+
+    This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`.
+    Refer to the superclass documentation for the generic methods.
+
+    This model is also a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation
+    /docs/en/api/paddle/fluid/dygraph/layers/Layer_en.html>`__ subclass. Use it as a regular Paddle Layer
+    and refer to the Paddle documentation for all matters related to general usage and behavior.
+
+    Args:
+        vocab_size (int):
+            Vocabulary size of `inputs_ids` in `DistilBertModel`. Defines the number of different tokens that can
+            be represented by the `inputs_ids` passed when calling `DistilBertModel`.
+        hidden_size (int, optional):
+            Dimensionality of the embedding layer, encoder layers and the pooler layer. Defaults to `768`.
+        num_hidden_layers (int, optional):
+            Number of hidden layers in the Transformer encoder. Defaults to `12`.
+        num_attention_heads (int, optional):
+            Number of attention heads for each attention layer in the Transformer encoder.
+            Defaults to `12`.
+        intermediate_size (int, optional):
+            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors
+            to ff layers are firstly projected from `hidden_size` to `intermediate_size`,
+            and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.
+            Defaults to `3072`.
+        hidden_act (str, optional):
+            The non-linear activation function in the feed-forward layer.
+            ``"gelu"``, ``"relu"`` and any other paddle supported activation functions
+            are supported. Defaults to `"gelu"`.
+        hidden_dropout_prob (float, optional):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+            Defaults to `0.1`.
+        attention_probs_dropout_prob (float, optional):
+            The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target.
+            Defaults to `0.1`.
+        max_position_embeddings (int, optional):
+            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length
+            of an input sequence. Defaults to `512`.
+        type_vocab_size (int, optional):
+            The vocabulary size of `token_type_ids`.
+            Defaults to `16`.
+        initializer_range (float, optional):
+            The standard deviation of the normal initializer.
+            Defaults to `0.02`.
+
+            .. note::
+                A normal_initializer initializes weight matrices as normal distributions.
+                See :meth:`DistilBertPretrainedModel.init_weights()` for how weights are initialized in `DistilBertModel`.
+
+        pad_token_id (int, optional):
+            The index of padding token in the token vocabulary.
+            Defaults to `0`.
+
+    """
+
     def __init__(self,
                  vocab_size,
                  hidden_size=768,
@@ -162,6 +218,44 @@ def __init__(self,
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r'''
+        The DistilBertModel forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                Indices of input sequence tokens in the vocabulary. They are
+                numerical representations of tokens that build the input sequence.
+                Its data type should be `int64` and it has a shape of [batch_size, sequence_length].
+            attention_mask (Tensor, optional):
+                Mask used in multi-head attention to avoid performing attention on some unwanted positions,
+                usually the paddings or the subsequent positions.
+                Its data type can be int, float and bool.
+                When the data type is bool, the `masked` tokens have `False` values and the others have `True` values.
+                When the data type is int, the `masked` tokens have `0` values and the others have `1` values.
+                When the data type is float, the `masked` tokens have `-INF` values and the others have `0` values.
+                It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`.
+                For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length],
+                [batch_size, num_attention_heads, sequence_length, sequence_length].
+                Defaults to `None`, which means no attention is prevented, i.e. no positions are masked.
+
+        Returns:
+            Tensor: Returns tensor `encoder_output`, which is the sequence of hidden-states at the last layer of the model.
+            Its data type should be float32 and its shape is [batch_size, sequence_length, hidden_size].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import DistilBertModel, DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                output = model(**inputs)
+        '''
+
         if attention_mask is None:
             attention_mask = paddle.unsqueeze(
                 (input_ids == self.pad_token_id
@@ -174,6 +268,21 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForSequenceClassification(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the output layer, designed for
+    sequence classification/regression tasks like GLUE tasks.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        num_classes (int, optional):
+            The number of classes. Defaults to `2`.
+        dropout (float, optional):
+            The dropout probability for output of DistilBert.
+            If None, use the same value as `hidden_dropout_prob` of `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
     def __init__(self, distilbert, num_classes=2, dropout=None):
         super(DistilBertForSequenceClassification, self).__init__()
         self.num_classes = num_classes
@@ -188,6 +297,36 @@ def __init__(self, distilbert, num_classes=2, dropout=None):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForSequenceClassification forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `logits`, a tensor of the input text classification logits.
+            Its shape is `[batch_size, num_classes]` and its dtype is `float32`.
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForSequenceClassification
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+                logits = model(**inputs)
+        """
+
         distilbert_output = self.distilbert(
             input_ids=input_ids, attention_mask=attention_mask)
 
@@ -202,6 +341,19 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForQuestionAnswering(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the hidden-states output to
+    compute `span_start_logits` and `span_end_logits`, designed for question-answering tasks like SQuAD.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        dropout (float, optional):
+            The dropout probability for output of DistilBert.
+            If None, use the same value as `hidden_dropout_prob` of `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
     def __init__(self, distilbert, dropout=None):
         super(DistilBertForQuestionAnswering, self).__init__()
         self.distilbert = distilbert  # allow bert to be config
@@ -211,6 +363,46 @@ def __init__(self, distilbert, dropout=None):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForQuestionAnswering forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            tuple: Returns tuple (`start_logits`, `end_logits`).
+
+            With the fields:
+
+            - start_logits (Tensor):
+                A tensor of the input token classification logits, which indicates the start position of the labelled span.
+                Its data type should be float32 and its shape is [batch_size, sequence_length].
+
+            - end_logits (Tensor):
+                A tensor of the input token classification logits, which indicates the end position of the labelled span.
+                Its data type should be float32 and its shape is [batch_size, sequence_length].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForQuestionAnswering
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                outputs = model(**inputs)
+
+                start_logits = outputs[0]
+                end_logits = outputs[1]
+        """
+
         sequence_output = self.distilbert(
             input_ids, attention_mask=attention_mask)
         sequence_output = self.dropout(sequence_output)
@@ -221,6 +413,21 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForTokenClassification(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a linear layer on top of the hidden-states output layer,
+    designed for token classification tasks like NER tasks.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+        num_classes (int, optional):
+            The number of classes. Defaults to `2`.
+        dropout (float, optional):
+            The dropout probability for output of DistilBert.
+            If None, use the same value as `hidden_dropout_prob` of `DistilBertModel`
+            instance `distilbert`. Defaults to None.
+    """
+
     def __init__(self, distilbert, num_classes=2, dropout=None):
         super(DistilBertForTokenClassification, self).__init__()
         self.num_classes = num_classes
@@ -232,6 +439,36 @@ def __init__(self, distilbert, num_classes=2, dropout=None):
         self.apply(self.init_weights)
 
     def forward(self, input_ids, attention_mask=None):
+        r"""
+        The DistilBertForTokenClassification forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `logits`, a tensor of the input token classification logits.
+            Its shape is `[batch_size, sequence_length, num_classes]` and its dtype is `float32`.
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers.distilbert.modeling import DistilBertForTokenClassification
+                from paddlenlp.transformers.distilbert.tokenizer import DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForTokenClassification.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+
+                logits = model(**inputs)
+        """
+
         sequence_output = self.distilbert(
             input_ids, attention_mask=attention_mask)
 
@@ -241,6 +478,14 @@ def forward(self, input_ids, attention_mask=None):
 
 
 class DistilBertForMaskedLM(DistilBertPretrainedModel):
+    """
+    DistilBert Model with a `language modeling` head on top.
+
+    Args:
+        distilbert (:class:`DistilBertModel`):
+            An instance of DistilBertModel.
+    """
+
     def __init__(self, distilbert):
         super(DistilBertForMaskedLM, self).__init__()
         self.distilbert = distilbert
@@ -255,6 +500,33 @@ def __init__(self, distilbert):
         self.apply(self.init_weights)
 
     def forward(self, input_ids=None, attention_mask=None):
+        r'''
+        The DistilBertForMaskedLM forward method, overrides the `__call__()` special method.
+
+        Args:
+            input_ids (Tensor):
+                See :class:`DistilBertModel`.
+            attention_mask (Tensor, optional):
+                See :class:`DistilBertModel`.
+
+        Returns:
+            Tensor: Returns tensor `prediction_logits`, the scores of masked token prediction.
+            Its data type should be float32 and its shape is [batch_size, sequence_length, vocab_size].
+
+        Example:
+            .. code-block::
+
+                import paddle
+                from paddlenlp.transformers import DistilBertForMaskedLM, DistilBertTokenizer
+
+                tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+                model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
+
+                inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
+                inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
+                prediction_logits = model(**inputs)
+        '''
+
         distilbert_output = self.distilbert(
             input_ids=input_ids, attention_mask=attention_mask)
         prediction_logits = self.vocab_transform(distilbert_output)
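
The docstrings added above spell out the attention_mask contract (bool/int/float masks broadcastable to [batch_size, num_attention_heads, sequence_length, sequence_length]) and the (start_logits, end_logits) pair returned by DistilBertForQuestionAnswering. The sketch below is illustrative only and not part of this commit; the fixed target length of 16, the use of tokenizer.pad_token_id, and the argmax span decoding are assumptions for demonstration rather than documented API.

    # Illustrative sketch only -- not part of this commit. Builds an explicit float
    # attention mask per the docstring semantics and decodes the QA outputs.
    import paddle
    from paddlenlp.transformers import DistilBertForQuestionAnswering, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')

    # Encode one example and pad it to a fixed length so the mask has an effect.
    input_ids = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")["input_ids"]
    pad_len = max(0, 16 - len(input_ids))                        # assumed target length
    input_ids = input_ids + [tokenizer.pad_token_id] * pad_len   # assumes pad_token_id is set

    # Float mask: 0 for kept positions, a large negative value (standing in for -INF)
    # for padded ones, broadcastable to
    # [batch_size, num_attention_heads, sequence_length, sequence_length].
    mask = [0.0] * (len(input_ids) - pad_len) + [-1e9] * pad_len
    input_ids = paddle.to_tensor([input_ids], dtype='int64')          # [1, seq_len]
    attention_mask = paddle.to_tensor([mask], dtype='float32')
    attention_mask = paddle.unsqueeze(attention_mask, axis=[1, 2])    # [1, 1, 1, seq_len]

    # DistilBertForQuestionAnswering returns the (start_logits, end_logits) tuple.
    start_logits, end_logits = model(input_ids, attention_mask=attention_mask)

    # Pick the most likely answer span from the per-token logits.
    start = int(paddle.argmax(start_logits, axis=-1).numpy()[0])
    end = int(paddle.argmax(end_logits, axis=-1).numpy()[0])
    print("predicted span:", start, end)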
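
Similarly, the new DistilBertForMaskedLM docstring documents prediction_logits of shape [batch_size, sequence_length, vocab_size]. A small sketch of decoding one masked position follows, again not part of this commit; the choice of position 3 and the BERT-style "[MASK]" token name are assumptions.

    # Illustrative sketch only -- not part of this commit. Decodes the highest-scoring
    # token at one masked position from the documented prediction_logits.
    import paddle
    from paddlenlp.transformers import DistilBertForMaskedLM, DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')

    ids = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")["input_ids"]
    mask_index = 3                                               # arbitrary position, for illustration
    ids[mask_index] = tokenizer.convert_tokens_to_ids("[MASK]")  # assumes the BERT-style "[MASK]" token

    input_ids = paddle.to_tensor([ids], dtype='int64')
    prediction_logits = model(input_ids)                         # [1, seq_len, vocab_size]

    predicted_id = int(paddle.argmax(prediction_logits[0, mask_index]).numpy())
    print(tokenizer.convert_ids_to_tokens([predicted_id]))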

paddlenlp/transformers/distilbert/tokenizer.py

Lines changed: 3 additions & 3 deletions
@@ -19,9 +19,9 @@
 
 class DistilBertTokenizer(BertTokenizer):
     """
-    Constructs a DistilBERT tokenizer. It uses a basic tokenizer to do punctuation
-    splitting, lower casing and so on, and follows a WordPiece tokenizer to
-    tokenize as subwords.
+    Constructs a DistilBertTokenizer.
+    The usage of DistilBertTokenizer is the same as
+    `BertTokenizer <https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.bert.tokenizer.html>`__.
     """
     resource_files_names = {"vocab_file": "vocab.txt"}  # for save_pretrained
     pretrained_resource_files_map = {
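
Because the revised docstring now simply defers to BertTokenizer for usage, a brief sketch of the shared call pattern is included below for reference. It is illustrative only and not part of this commit, and the returned keys are assumed to match BertTokenizer's ('input_ids', 'token_type_ids').

    # Illustrative sketch only -- not part of this commit. DistilBertTokenizer is used
    # exactly like BertTokenizer; the returned keys are assumed to match it.
    from paddlenlp.transformers import DistilBertTokenizer

    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    # Single text: WordPiece tokenization plus the [CLS]/[SEP] special tokens.
    single = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
    print(single["input_ids"])
    print(tokenizer.convert_ids_to_tokens(single["input_ids"]))

    # Text pair, as in sentence-pair tasks; the two segments are separated by [SEP]
    # and distinguished by token_type_ids.
    pair = tokenizer("What is PaddleNLP?", "PaddleNLP is an NLP library built on PaddlePaddle.")
    print(pair["token_type_ids"])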
