@@ -177,12 +177,61 @@ def forward(self, s, h, len_mask=None):
         return c, a


+def split_last(x, shape):
+    "split the last dimension to given shape"
+    shape = list(shape)
+    assert shape.count(-1) <= 1
+    if -1 in shape:
+        shape[shape.index(-1)] = int(x.size(-1) / -np.prod(shape))
+    return x.view(*x.size()[:-1], *shape)
+
+
+def merge_last(x, n_dims):
+    "merge the last n_dims to a dimension"
+    s = x.size()
+    assert n_dims > 1 and n_dims < len(s)
+    return x.view(*s[:-n_dims], -1)
+
+
+class MultiHeadedSelfAttention(nn.Module):
+    """ Multi-Headed Dot Product Attention """
+    def __init__(self, state_vec_size, listen_vec_size, proj_hidden_size=512, num_heads=1, dropout=0.1):
+        super().__init__()
+        self.proj_q = nn.Linear(state_vec_size, proj_hidden_size)
+        self.proj_k = nn.Linear(listen_vec_size, proj_hidden_size)
+        self.proj_v = nn.Linear(listen_vec_size, proj_hidden_size)
+        self.drop = nn.Dropout(dropout)
+        self.scores = None  # for visualization
+        self.n_heads = num_heads
+
+    def forward(self, q, k, mask):
+        """
+        q(query), k(key), v(value) : (B(batch_size), S(seq_len), D(dim))
+        mask : (B(batch_size) x S(seq_len))
+        * split D(dim) into (H(n_heads), W(width of head)); D = H * W
+        """
+        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
+        q, k, v = self.proj_q(q), self.proj_k(k), self.proj_v(k)
+        q, k, v = (split_last(x, (self.n_heads, -1)).transpose(1, 2) for x in [q, k, v])
+        # (B, H, S, W) @ (B, H, W, S) -> (B, H, S, S) -softmax-> (B, H, S, S)
+        scores = q @ k.transpose(-2, -1) / np.sqrt(k.size(-1))
+        if mask is not None:
+            mask = mask[:, None, None, :].float()
+            scores -= 10000.0 * (1.0 - mask)
+        scores = self.drop(F.softmax(scores, dim=-1))
+        # (B, H, S, S) @ (B, H, S, W) -> (B, H, S, W) -trans-> (B, S, H, W)
+        h = (scores @ v).transpose(1, 2).contiguous()
+        # -merge-> (B, S, D)
+        h = merge_last(h, 2)
+        self.scores = scores
+        return h
+
+
 class Speller(nn.Module):

     def __init__(self, listen_vec_size, label_vec_size, max_seq_lens=256, sos=None, eos=None,
                  rnn_type=nn.LSTM, rnn_hidden_size=512, rnn_num_layers=2,
-                 apply_attend_proj=False, proj_hidden_size=256, num_attend_heads=1,
-                 masked_attend=True):
+                 proj_hidden_size=256, num_attend_heads=1, masked_attend=True):
         super().__init__()

         assert sos is not None and 0 <= sos < label_vec_size
@@ -204,8 +253,7 @@ def __init__(self, listen_vec_size, label_vec_size, max_seq_lens=256, sos=None,
         self.norm = nn.LayerNorm(Hs, elementwise_affine=False)

         self.attention = Attention(state_vec_size=Hs, listen_vec_size=Hc,
-                                   apply_proj=apply_attend_proj, proj_hidden_size=proj_hidden_size,
-                                   num_heads=num_attend_heads)
+                                   proj_hidden_size=proj_hidden_size, num_heads=num_attend_heads)

         self.masked_attend = masked_attend

@@ -330,7 +378,7 @@ def forward(self, x):
 class ListenAttendSpell(nn.Module):

     def __init__(self, label_vec_size=p.NUM_CTC_LABELS, listen_vec_size=256,
-                 state_vec_size=256, num_attend_heads=1, input_folding=2, smoothing=0.001):
+                 state_vec_size=256, num_attend_heads=4, input_folding=2, smoothing=0.001):
         super().__init__()

         self.label_vec_size = label_vec_size + 2  # to add <sos>, <eos>
@@ -347,7 +395,7 @@ def __init__(self, label_vec_size=p.NUM_CTC_LABELS, listen_vec_size=256,
         self.spell = Speller(listen_vec_size=listen_vec_size, label_vec_size=self.label_vec_size,
                              sos=self.sos, eos=self.eos, max_seq_lens=256,
                              rnn_hidden_size=state_vec_size, rnn_num_layers=2,
-                             apply_attend_proj=True, proj_hidden_size=128, num_attend_heads=num_attend_heads)
+                             proj_hidden_size=256, num_attend_heads=num_attend_heads)

         self.attentions = None
         self.regions = None
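
For context on the new attention path, below is a minimal standalone sketch of the shape bookkeeping that MultiHeadedSelfAttention relies on. It reuses the split_last/merge_last helpers exactly as added in the diff and checks the (B, S, D) -> (B, H, S, W) -> (B, S, D) round trip plus the -10000 padding-mask trick before softmax. The sizes B=2, S=5, D=512, H=4 and the all-ones mask are illustrative assumptions, not values taken from this repository.

# Sketch only: mirrors the head split/merge and masked scaled dot-product
# used by MultiHeadedSelfAttention above; all sizes here are assumptions.
import numpy as np
import torch
import torch.nn.functional as F

def split_last(x, shape):
    "split the last dimension to given shape"
    shape = list(shape)
    assert shape.count(-1) <= 1
    if -1 in shape:
        shape[shape.index(-1)] = int(x.size(-1) / -np.prod(shape))
    return x.view(*x.size()[:-1], *shape)

def merge_last(x, n_dims):
    "merge the last n_dims to a dimension"
    s = x.size()
    assert n_dims > 1 and n_dims < len(s)
    return x.view(*s[:-n_dims], -1)

B, S, D, H = 2, 5, 512, 4          # batch, seq len, projected dim, heads (illustrative)
q = k = v = torch.randn(B, S, D)   # stands in for the proj_q/proj_k/proj_v outputs
mask = torch.ones(B, S)            # 1 = keep, 0 = padded frame

# (B, S, D) -> (B, S, H, W) -> (B, H, S, W), with W = D // H
q, k, v = (split_last(x, (H, -1)).transpose(1, 2) for x in (q, k, v))
assert q.shape == (B, H, S, D // H)

# per-head scaled dot product: (B, H, S, S); -10000 drives padded keys to ~0 after softmax
scores = q @ k.transpose(-2, -1) / np.sqrt(k.size(-1))
scores = scores - 10000.0 * (1.0 - mask[:, None, None, :])
scores = F.softmax(scores, dim=-1)

# weighted sum, swap back, merge heads: (B, H, S, W) -> (B, S, H, W) -> (B, S, D)
h = merge_last((scores @ v).transpose(1, 2).contiguous(), 2)
assert h.shape == (B, S, D)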