Commit

Typo fixes (#1643)
csukuangfj authored Jun 3, 2024
1 parent 42a97f6 commit b880622
Showing 2 changed files with 15 additions and 15 deletions.
2 changes: 1 addition & 1 deletion egs/librispeech/ASR/zipformer/scaling.py
@@ -137,7 +137,7 @@ def get_common_basis(self, p: "PiecewiseLinear", include_crossings: bool = False
p: the other piecewise linear function
include_crossings: if true, include in the x values positions
- where the functions indicate by this and p crosss.
+ where the functions indicate by this and p cross.
"""
assert isinstance(p, PiecewiseLinear), type(p)

28 changes: 14 additions & 14 deletions egs/librispeech/ASR/zipformer/zipformer.py
@@ -205,9 +205,9 @@ def get_feature_masks(self, x: Tensor) -> Union[List[float], List[Tensor]]:
"""
In eval mode, returns [1.0] * num_encoders; in training mode, returns a number of
randomized feature masks, one per encoder.
- On e.g. 15% of frames, these masks will zero out all enocder dims larger than
+ On e.g. 15% of frames, these masks will zero out all encoder dims larger than
some supplied number, e.g. >256, so in effect on those frames we are using
- a smaller encoer dim.
+ a smaller encoder dim.
We generate the random masks at this level because we want the 2 masks to 'agree'
all the way up the encoder stack. This will mean that the 1st mask will have
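
As a reading aid, here is a minimal sketch of the kind of per-frame feature mask this docstring describes; the 15% rate and the 256-dim cutoff come from the text above, while the function name and shapes are illustrative assumptions rather than the repository's implementation:

```python
import torch
from torch import Tensor


def toy_feature_mask(
    x: Tensor,                # (seq_len, batch_size, embed_dim)
    dim_cutoff: int = 256,    # dims >= this get zeroed on selected frames
    frame_rate: float = 0.15,
) -> Tensor:
    """Mask that is 1.0 everywhere, except that on ~frame_rate of frames
    all dims >= dim_cutoff are zeroed, emulating a smaller encoder dim."""
    seq_len, batch_size, embed_dim = x.shape
    frame_sel = torch.rand(seq_len, batch_size, 1) < frame_rate  # which frames
    dim_sel = torch.arange(embed_dim) >= dim_cutoff              # which dims
    mask = torch.ones(seq_len, batch_size, embed_dim)
    mask.masked_fill_(frame_sel & dim_sel, 0.0)
    return mask
```

On frames where the mask fires, `x * mask` behaves as if the encoder dim were `dim_cutoff`.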
@@ -548,9 +548,9 @@ class Zipformer2EncoderLayer(nn.Module):
Args:
embed_dim: the number of expected features in the input (required).
nhead: the number of heads in the multiheadattention models (required).
- feedforward_dim: the dimension of the feedforward network model (default=2048).
+ feedforward_dim: the dimension of the feedforward network model (required).
dropout: the dropout value (default=0.1).
- cnn_module_kernel (int): Kernel size of convolution module.
+ cnn_module_kernel (int): Kernel size of convolution module (default=31).
Examples::
>>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8)
@@ -1028,7 +1028,7 @@ def __init__(
)
self.num_layers = num_layers

- assert 0 <= warmup_begin <= warmup_end
+ assert 0 <= warmup_begin <= warmup_end, (warmup_begin, warmup_end)

delta = (1.0 / num_layers) * (warmup_end - warmup_begin)
cur_begin = warmup_begin # interpreted as a training batch index
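
The two lines above divide the warm-up period evenly across layers; a sketch of the resulting schedule (the loop and return value are assumptions for illustration, since the real constructor hands each window to the corresponding layer):

```python
def layer_warmup_windows(num_layers: int, warmup_begin: float, warmup_end: float):
    """Split [warmup_begin, warmup_end] (training batch indices) into
    num_layers consecutive equal windows, one per encoder layer."""
    assert 0 <= warmup_begin <= warmup_end, (warmup_begin, warmup_end)
    delta = (1.0 / num_layers) * (warmup_end - warmup_begin)
    cur_begin = warmup_begin  # interpreted as a training batch index
    windows = []
    for _ in range(num_layers):
        windows.append((cur_begin, cur_begin + delta))
        cur_begin += delta
    return windows


# e.g. layer_warmup_windows(4, 0.0, 2000.0)
# -> [(0.0, 500.0), (500.0, 1000.0), (1000.0, 1500.0), (1500.0, 2000.0)]
```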
@@ -1177,7 +1177,7 @@ def __init__(
def _get_bypass_scale(self, batch_size: int):
# returns bypass-scale of shape (num_channels,),
# or (batch_size, num_channels,). This is actually the
- # scale on the non-residual term, so 0 correponds to bypassing
+ # scale on the non-residual term, so 0 corresponds to bypassing
# this module.
if torch.jit.is_scripting() or torch.jit.is_tracing() or not self.training:
return self.bypass_scale
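
A sketch of the interpolation this scale controls; the function name and surrounding logic are assumptions, not the module's actual forward code:

```python
import torch
from torch import Tensor


def apply_bypass(src_orig: Tensor, src: Tensor, bypass_scale: Tensor) -> Tensor:
    """Interpolate between the module's input (src_orig) and its output
    (src): scale == 0 returns src_orig unchanged (module fully bypassed),
    scale == 1 keeps the full module output."""
    return src_orig + bypass_scale * (src - src_orig)
```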
@@ -1381,12 +1381,12 @@ class CompactRelPositionalEncoding(torch.nn.Module):
when encoding absolute position, but not important when encoding relative position because there
is now no need to compare two large offsets with each other.
- Our embedding works done by projecting the interval [-infinity,infinity] to a finite interval
- using the atan() function, before doing the fourier transform of that fixed interval. The
+ Our embedding works by projecting the interval [-infinity,infinity] to a finite interval
+ using the atan() function, before doing the Fourier transform of that fixed interval. The
atan() function would compress the "long tails" too small,
making it hard to distinguish between different magnitudes of large offsets, so we use a logarithmic
function to compress large offsets to a smaller range before applying atan().
- Scalings are chosen in such a way that the embedding can clearly distinguish invidual offsets as long
+ Scalings are chosen in such a way that the embedding can clearly distinguish individual offsets as long
as they are quite close to the origin, e.g. abs(offset) <= about sqrt(embedding_dim)
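
A sketch of the offset compression described above (logarithmic compression of large offsets, then atan()); the function name and constants are illustrative assumptions, and the class's actual scalings differ:

```python
import torch


def compress_offsets(offsets: torch.Tensor, length_factor: float = 1.0) -> torch.Tensor:
    """Map offsets in (-inf, inf) to a bounded interval: compress the
    long tails logarithmically (keeping the sign, roughly linear near 0),
    then squash with atan() into (-pi/2, pi/2) before a Fourier-style
    sin/cos embedding is applied."""
    compressed = offsets.sign() * torch.log1p(offsets.abs() / length_factor)
    return torch.atan(compressed)
```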
@@ -1408,10 +1408,10 @@ def __init__(
"""Construct a CompactRelPositionalEncoding object."""
super(CompactRelPositionalEncoding, self).__init__()
self.embed_dim = embed_dim
- assert embed_dim % 2 == 0
+ assert embed_dim % 2 == 0, embed_dim
self.dropout = Dropout2(dropout_rate)
self.pe = None
- assert length_factor >= 1.0
+ assert length_factor >= 1.0, length_factor
self.length_factor = length_factor
self.extend_pe(torch.tensor(0.0).expand(max_len))

@@ -1555,7 +1555,7 @@ def __init__(
# due to how Adam/ScaledAdam work, it can learn a fairly large nonzero
# bias because the small numerical roundoff tends to have a non-random
# sign. This module is intended to prevent that. Use a very small
- # probability; that should be suffixient to fix the problem.
+ # probability; that should be sufficient to fix the problem.
self.balance_keys = Balancer(
key_head_dim * num_heads,
channel_dim=-1,
@@ -1571,7 +1571,7 @@ def __init__(
pos_dim, num_heads * pos_head_dim, bias=False, initial_scale=0.05
)

- # the following are for diagnosics only, see --print-diagnostics option
+ # the following are for diagnostics only, see --print-diagnostics option
self.copy_pos_query = Identity()
self.copy_query = Identity()

@@ -1609,7 +1609,7 @@ def forward(
k = x[..., query_dim : 2 * query_dim]
# p is the position-encoding query
p = x[..., 2 * query_dim :]
- assert p.shape[-1] == num_heads * pos_head_dim
+ assert p.shape[-1] == num_heads * pos_head_dim, (p.shape[-1], num_heads, pos_head_dim)

q = self.copy_query(q) # for diagnostics only, does nothing.
k = self.whiten_keys(self.balance_keys(k)) # does nothing in the forward pass.
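
The slicing in this hunk packs the query, key, and position-encoding query along the last dimension; a toy rerun of the same arithmetic (the dimension values here are made up for illustration):

```python
import torch

num_heads, query_head_dim, pos_head_dim = 4, 8, 2
query_dim = num_heads * query_head_dim  # 32

# x packs [query | key | position-query] along the last dim
x = torch.randn(10, 3, 2 * query_dim + num_heads * pos_head_dim)  # (10, 3, 72)

q = x[..., 0:query_dim]                # (10, 3, 32)
k = x[..., query_dim : 2 * query_dim]  # (10, 3, 32)
p = x[..., 2 * query_dim :]            # (10, 3, 8)
assert p.shape[-1] == num_heads * pos_head_dim, (p.shape[-1], num_heads, pos_head_dim)
```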
