Skip to content

Commit

Permalink
Fix some docs what layerdrop does (#23691)
Browse files Browse the repository at this point in the history
* Fix some docs what layerdrop does

* Update src/transformers/models/data2vec/configuration_data2vec_audio.py

Co-authored-by: Sylvain Gugger <[email protected]>

* Fix more docs

---------

Co-authored-by: Sylvain Gugger <[email protected]>
  • Loading branch information
zspo and sgugger authored May 23, 2023
1 parent 357f281 commit 003a0cf
Show file tree
Hide file tree
Showing 36 changed files with 68 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def forward(
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
loss: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification (or regression if config.num_labels==1) loss.
logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
Classification (or regression if config.num_labels==1) scores (before SoftMax).
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/generation/logits_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,7 +678,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more information.
Args:
prefix_allowed_tokens_fn: (`Callable[[int, torch.Tensor], List[int]]`):
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
This function constraints the beam search to allowed tokens only at each step. This function takes 2
arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/modeling_outputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1522,7 +1522,7 @@ class Seq2SeqTSModelOutput(ModelOutput):
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
Scaling values of each time series' context window which is used to give the model inputs of the same
magnitude and then used to rescale back to the original magnitude.
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
Static features of each time series' in a batch which are copied to the covariates at inference time.
"""

Expand Down Expand Up @@ -1593,7 +1593,7 @@ class Seq2SeqTSPredictionOutput(ModelOutput):
scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*):
Scaling values of each time series' context window which is used to give the model inputs of the same
magnitude and then used to rescale back to the original magnitude.
static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*):
Static features of each time series' in a batch which are copied to the covariates at inference time.
"""

Expand Down
2 changes: 1 addition & 1 deletion src/transformers/modeling_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -912,7 +912,7 @@ def get_head_mask(
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (`int`):
The number of hidden layers in the model.
is_attention_chunked: (`bool`, *optional*, defaults to `False`):
is_attention_chunked (`bool`, *optional*, defaults to `False`):
Whether or not the attentions scores are computed by chunks or not.
Returns:
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/align/configuration_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ class AlignVisionConfig(PretrainedConfig):
List of output channel sizes to be used in each block for convolutional layers.
depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
List of block indices with square padding.
strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
List of stride sizes to be used in each block for convolutional layers.
num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
List of the number of times each block is to repeated.
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/blip/modeling_blip_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ def get_extended_attention_mask(
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
device: (`torch.device`):
device (`torch.device`):
The device of the input to the model.
Returns:
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/blip/modeling_tf_blip_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,7 @@ def get_extended_attention_mask(
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
is_decoder: (`bool`):
is_decoder (`bool`):
Whether the model is used as a decoder.
Returns:
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/blip_2/modeling_blip_2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1059,7 +1059,7 @@ def get_extended_attention_mask(
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (`Tuple[int]`):
The shape of the input to the model.
device: (`torch.device`):
device (`torch.device`):
The device of the input to the model.
Returns:
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/bloom/modeling_bloom.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
Merge heads together over the last dimenstion
Args:
x: (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ class Data2VecAudioConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the final projection layer of [`Data2VecAudioForCTC`].
layerdrop (`float`, *optional*, defaults to 0.1):
The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more
details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ class DeformableDetrConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/deta/configuration_deta.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class DetaConfig(PretrainedConfig):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
init_xavier_std (`float`, *optional*, defaults to 1):
The scaling factor used for the Xavier initialization gain in the HM Attention map module.
encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
auxiliary_loss (`bool`, *optional*, defaults to `False`):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ class EfficientNetConfig(PretrainedConfig):
List of output channel sizes to be used in each block for convolutional layers.
depthwise_padding (`List[int]`, *optional*, defaults to `[]`):
List of block indices with square padding.
strides: (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`):
List of stride sizes to be used in each block for convolutional layers.
num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`):
List of the number of times each block is to repeated.
Expand Down
3 changes: 3 additions & 0 deletions src/transformers/models/hubert/configuration_hubert.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ class HubertConfig(PretrainedConfig):
The dropout ratio for the attention probabilities.
final_dropout (`float`, *optional*, defaults to 0.1):
The dropout probabilitiy for the final projection layer of [`Wav2Vec2ForCTC`].
layerdrop (`float`, *optional*, defaults to 0.1):
The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more
details.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
Expand Down
14 changes: 7 additions & 7 deletions src/transformers/models/lxmert/modeling_lxmert.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ class LxmertForQuestionAnsweringOutput(ModelOutput):
loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
Total loss as the sum of the masked language modeling loss and the next sequence prediction
(classification) loss.k.
question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`, *optional*):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
Expand Down Expand Up @@ -153,10 +153,10 @@ class LxmertForPreTrainingOutput(ModelOutput):
(classification) loss.
prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score: (`torch.FloatTensor` of shape `(batch_size, 2)`):
cross_relationship_score (`torch.FloatTensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
question_answering_score: (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
question_answering_score (`torch.FloatTensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for input features + one for the output of each cross-modality layer) of
Expand Down Expand Up @@ -828,12 +828,12 @@ def _init_weights(self, module):
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
visual_feats: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
visual_feats (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents visual features. They ROI pooled object features from bounding boxes using a
faster-RCNN model)
These are currently not provided by the transformers library.
visual_pos: (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
visual_pos (`torch.FloatTensor` of shape `(batch_size, num_visual_features, visual_pos_dim)`):
This input represents spacial features corresponding to their relative (via index) visual features. The
pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to
1.
Expand Down Expand Up @@ -1171,7 +1171,7 @@ def forward(
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
obj_labels: (`Dict[Str: Tuple[Torch.FloatTensor, Torch.FloatTensor]]`, *optional*):
obj_labels (`Dict[Str: Tuple[Torch.FloatTensor, Torch.FloatTensor]]`, *optional*):
each key is named after each one of the visual losses and each element of the tuple is of the shape
`(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id and
the label score respectively
Expand Down Expand Up @@ -1398,7 +1398,7 @@ def forward(
return_dict: Optional[bool] = None,
) -> Union[LxmertForQuestionAnsweringOutput, Tuple[torch.FloatTensor]]:
r"""
labels: (`Torch.Tensor` of shape `(batch_size)`, *optional*):
labels (`Torch.Tensor` of shape `(batch_size)`, *optional*):
A one-hot representation of the correct answer
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
Expand Down
10 changes: 5 additions & 5 deletions src/transformers/models/lxmert/modeling_tf_lxmert.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,10 @@ class TFLxmertForPreTrainingOutput(ModelOutput):
(classification) loss.
prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cross_relationship_score: (`tf.Tensor` of shape `(batch_size, 2)`):
cross_relationship_score (`tf.Tensor` of shape `(batch_size, 2)`):
Prediction scores of the textual matching objective (classification) head (scores of True/False
continuation before SoftMax).
question_answering_score: (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
question_answering_score (`tf.Tensor` of shape `(batch_size, n_qa_answers)`):
Prediction scores of question answering objective (classification).
language_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `tf.Tensor` (one for input features + one for the output of each cross-modality layer) of shape
Expand Down Expand Up @@ -873,12 +873,12 @@ def serving(self, inputs):
[`PreTrainedTokenizer.encode`] for details.
[What are input IDs?](../glossary#input-ids)
visual_feats: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
visual_feats (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents visual features. They ROI pooled object features from bounding boxes using a
faster-RCNN model)
These are currently not provided by the transformers library.
visual_pos: (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
visual_pos (`tf.Tensor` of shape `(batch_size, num_visual_features, visual_feat_dim)`):
This input represents spacial features corresponding to their relative (via index) visual features. The
pre-trained LXMERT model expects these spacial features to be normalized bounding boxes on a scale of 0 to
1.
Expand Down Expand Up @@ -1297,7 +1297,7 @@ def call(
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
obj_labels: (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`):
obj_labels (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`):
each key is named after each one of the visual losses and each element of the tuple is of the shape
`(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id and
the label score respectively
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/models/mask2former/modeling_mask2former.py
Original file line number Diff line number Diff line change
Expand Up @@ -1767,7 +1767,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
of the predicted mask for each query, instead of attending to the full feature map.
Args:
config: (`Mask2FormerConfig`):
config (`Mask2FormerConfig`):
Configuration used to instantiate Mask2FormerMaskedAttentionDecoder.
"""

Expand Down Expand Up @@ -2003,7 +2003,7 @@ def __init__(self, hidden_size: int, num_heads: int, mask_feature_size: torch.Te
The feature dimension of the Mask2FormerMaskedAttentionDecoder
num_heads (`int`):
The number of heads used in the Mask2FormerMaskedAttentionDecoder
mask_feature_size: (`torch.Tensor`):
mask_feature_size (`torch.Tensor`):
one of the output dimensions of the predicted masks for each query
"""
super().__init__()
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/mpnet/tokenization_mpnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
This should likely be deactivated for Japanese (see this
[issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents: (`bool`, *optional*):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/mpnet/tokenization_mpnet_fast.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
issue](https://github.com/huggingface/transformers/issues/328)).
strip_accents: (`bool`, *optional*):
strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT).
"""
Expand Down
2 changes: 1 addition & 1 deletion src/transformers/models/opt/configuration_opt.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class OPTConfig(PretrainedConfig):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
layerdrop: (`float`, *optional*, defaults to 0.0):
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more
details.
init_std (`float`, *optional*, defaults to 0.02):
Expand Down
4 changes: 2 additions & 2 deletions src/transformers/models/pegasus_x/configuration_pegasus_x.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,10 @@ class PegasusXConfig(PretrainedConfig):
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
use_cache (`bool`, *optional*, defaults to `True`):
Expand Down
Loading

0 comments on commit 003a0cf

Please sign in to comment.