【Day24】 Transformer Implementation (Part 1)

Time to Start Building

  • Originally I wanted to share the Transformer I adapted from implementations found online, but I later realized that my architecture isn't actually the original Transformer encoder (it doesn't even add Positional Encoding); it's just a MultiHeadAttention layer that I wired up however I pleased. So here I'll follow Hung-yi Lee's version for the explanation.

First, here's a diagram to build from, although some of the internal operations aren't drawn on it.

Create Feed-Forward Networks

The diagram labels the Feed-Forward block as Dense; in practice it looks like this:

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, LayerNormalization

def Create_feed_forward_network(d_model, dff):
  # This FFN applies two linear transformations to the input, with a ReLU in between
  return tf.keras.Sequential([
      Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      Dense(d_model)                  # (batch_size, seq_len, d_model)
  ])

Usually dff is set larger than d_model so the FFN can extract useful information from the d_model-dimensional input. In the paper, d_model is 512 and dff is 2048; both are tunable hyperparameters.
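
A quick sanity check (my own snippet, not part of the original post) to confirm the shapes, using the paper's default sizes:

ffn = Create_feed_forward_network(d_model=512, dff=2048)
x = tf.random.uniform((64, 50, 512))   # (batch_size, seq_len, d_model)
print(ffn(x).shape)                    # (64, 50, 512)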

Create MultiHeadAttention

The key part of MultiHead is splitting the heads apart, computing attention for each head separately, and concatenating the results back together afterwards.

def split_heads(x, batch_size, num_heads, depth):
    x = tf.reshape(x, (batch_size, -1, num_heads, depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

def do_MultiHeadAttention(q, k, v, mask):
  batch_size = tf.shape(q)[0]
  # Not shown in the diagram: q, k, v each first go through a linear projection to d_model dimensions
  q = Dense(d_model)(q)
  # (batch_size, num_heads, seq_len, depth), where depth = d_model // num_heads
  q = split_heads(q, batch_size, num_heads, depth)
  # ... likewise for k and v ...
  k = split_heads(Dense(d_model)(k), batch_size, num_heads, depth)
  v = split_heads(Dense(d_model)(v), batch_size, num_heads, depth)
  # written yesterday (a minimal sketch follows below)
  scaled_attention, attention_weights = do_attention(q, k, v, mask)
  # put the split heads back together
  scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
  concat_attention = tf.reshape(scaled_attention, (batch_size, -1, d_model))

  # there's one more linear projection at the end, which the diagram also leaves out
  output = Dense(d_model)(concat_attention)  # (batch_size, seq_len_q, d_model)
  return output, attention_weights
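
For completeness, here is a minimal version of the do_attention used above, i.e. the standard scaled dot-product attention covered yesterday; treat it as a sketch in case you're reading this post on its own:

def do_attention(q, k, v, mask):
  # scaled dot-product attention: softmax(QK^T / sqrt(dk)) V
  matmul_qk = tf.matmul(q, k, transpose_b=True)          # (..., seq_len_q, seq_len_k)
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
  if mask is not None:
    # masked positions get a huge negative logit so softmax gives them ~0 weight
    scaled_attention_logits += (mask * -1e9)
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
  output = tf.matmul(attention_weights, v)               # (..., seq_len_q, depth_v)
  return output, attention_weights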
  

Our version is actually no longer the original method; it follows the approach of this paper, whose source code is written in TF 1.0. Below is my port to TF 2.0, which I have verified produces the correct results. The ATTN used in our music "Transformer" (which, again, isn't really one) is this version. It's claimed to be a more efficient attention, but I haven't studied the paper in depth, so take a look yourself if you're interested.

class ATTN(tf.keras.layers.Layer):
    def __init__(self, n_state, n_head, seq):
        super(ATTN, self).__init__()
        self.n_state = n_state * 3
        self.n_head = n_head
        # E holds the learned relative-position embeddings;
        # note that the head count (16) and per-head feature size (32) are hard-coded here
        E_initializer = tf.constant_initializer(0)
        self.E = tf.Variable(
            E_initializer(shape=[16, seq, 32], dtype=tf.float32), name="E"
        )

    def split_heads(self, x):
        # From [batch, sequence, features] to [batch, heads, sequence, features]
        return tf.transpose(self.split_states(x, self.n_head), [0, 2, 1, 3])

    def split_states(self, x, n):
        """Reshape the last dimension of x into [n, x.shape[-1]/n]."""
        *start, m = shape_list(x)
        return tf.reshape(x, start + [n, m // n])

    def merge_heads(self, x):
        # Reverse of split_heads
        return self.merge_states(tf.transpose(x, [0, 2, 1, 3]))

    def mask_attn_weights(self, w):
        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(w)
        b = self.attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        w = w * b - tf.cast(1e10, w.dtype) * (1 - b)
        return w

    def merge_states(self, x):
        """Smash the last two dimensions of x into a single dimension."""
        *start, a, b = shape_list(x)
        return tf.reshape(x, start + [a * b])

    def attention_mask(self, nd, ns, *, dtype):
        """1's in the lower triangle, counting from the lower right corner.
        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype)

    def relative_attn(self, q):
        # q have shape [batch, heads, sequence, features]
        batch, heads, sequence, features = shape_list(q)
        # [heads, batch, sequence, features]
        q_ = tf.transpose(q, [1, 0, 2, 3])
        # [heads, batch * sequence, features]
        q_ = tf.reshape(q_, [heads, batch * sequence, features])
        # [heads, batch * sequence, sequence]
        rel = tf.matmul(q_, self.E, transpose_b=True)
        # [heads, batch, sequence, sequence]
        rel = tf.reshape(rel, [heads, batch, sequence, sequence])
        # [heads, batch, sequence, 1+sequence]
        rel = tf.pad(rel, ((0, 0), (0, 0), (0, 0), (1, 0)))
        # [heads, batch, sequence+1, sequence]
        rel = tf.reshape(rel, (heads, batch, sequence + 1, sequence))
        # [heads, batch, sequence, sequence]
        rel = rel[:, :, 1:]
        # [batch, heads, sequence, sequence]
        rel = tf.transpose(rel, [1, 0, 2, 3])
        return rel

    def multihead_attn(self, q, k, v):
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        w = w + self.relative_attn(q)
        w = w * tf.math.rsqrt(tf.cast(v.shape[-1], w.dtype))
        w = self.mask_attn_weights(w)
        w = tf.nn.softmax(w, axis=-1)
        a = tf.matmul(w, v)
        return a

    def call(self, inputs):
        q, k, v = map(self.split_heads, tf.split(inputs, 3, axis=2))
        present = tf.stack([k, v], axis=1)
        a = self.multihead_attn(q, k, v)
        a = self.merge_heads(a)
        return a, present
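
The class above calls a shape_list helper that isn't shown in this post; it comes from the original TF 1.x source. A common implementation, plus a quick shape check (my own addition, so the numbers are only illustrative; note that the 16 heads and 32 features per head have to match the hard-coded shape of E):

def shape_list(x):
    # handle dynamic shapes cleanly: use static dims where known, tf.shape(x) otherwise
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

# hypothetical smoke test: n_state = 16 heads * 32 features = 512, sequence length 64
attn = ATTN(n_state=512, n_head=16, seq=64)
x = tf.random.normal([2, 64, 512 * 3])   # q, k, v concatenated along the last axis
a, present = attn(x)
print(a.shape)        # (2, 64, 512)
print(present.shape)  # (2, 2, 16, 64, 32)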
        

Create EncoderLayer

Now that we have the layers we need, it's time to stack them up. Build according to the diagram and you can't go wrong.

# The Encoder contains N EncoderLayers, and each EncoderLayer has two sub-layers: MultiHeadAttention & feed_forward_network
# Let's assemble one EncoderLayer first
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super(EncoderLayer, self).__init__()

    self.ffn = Create_feed_forward_network(d_model, dff)
    # layer norm is also commonly used in RNN-based models. One layer norm per sub-layer
    self.layernorm1 = LayerNormalization(epsilon=1e-6)
    self.layernorm2 = LayerNormalization(epsilon=1e-6)

    # one dropout layer per sub-layer
    # the Transformer paper uses a default dropout rate of 0.1
    # if you're not worried about overfitting you can also leave it out
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)

  # we pass in a "training" flag because dropout behaves differently during training and inference
  def call(self, x, training, mask):
    # except for `attn`, every tensor has shape (batch_size, input_seq_len, d_model)
    # attn.shape == (batch_size, num_heads, input_seq_len, input_seq_len)

    # the Encoder uses *self*-attention, so q, k, v are all the input itself

    # we also need a padding mask to hide the zero-padded positions of the input sequence (a sketch of it follows below)
    # you can also try the other version XD, or follow the teacher's approach and wrap all of this into a layer
    attn_output, attn = do_MultiHeadAttention(x, x, x, mask)
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)
    ffn_output = self.ffn(out1)
    ffn_output = self.dropout2(ffn_output, training=training)  # remember to pass training
    out2 = self.layernorm2(out1 + ffn_output)
    return out2
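
The padding mask mentioned in the comments above is never defined in this post; here is a minimal sketch, following the usual convention of marking padded (zero) positions with 1 so that do_attention pushes their logits towards negative infinity:

def create_padding_mask(seq):
  # seq: (batch_size, seq_len) of token indices, where padded positions are 0
  mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
  # add broadcast axes so the mask fits (batch_size, num_heads, seq_len_q, seq_len_k)
  return mask[:, tf.newaxis, tf.newaxis, :]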

Create Encoder

Finally, add positional encoding and the Embedding layer, and the Encoder is complete.

class Encoder(tf.keras.layers.Layer):
  # Besides the arguments that EncoderLayer already needs, the Encoder also takes:
  # - num_layers: how many EncoderLayers to stack, the `N` from the lecture
  # - input_vocab_size: used to turn indices into word embeddings
  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               rate=0.1):
    super(Encoder, self).__init__()
    # this is the embedding dimension, not the model itself
    self.d_model = d_model
    # the input goes through here first
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    # see yesterday's post
    self.pos_encoding = positional_encoding(input_vocab_size, self.d_model)

    # build N EncoderLayers
    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    # the input x.shape == (batch_size, input_seq_len)
    # every layer below outputs (batch_size, input_seq_len, d_model)
    input_seq_len = tf.shape(x)[1]

    # turn the 2-D index sequence into a 3-D embedding tensor, scale it by sqrt(d_model) as in the paper,
    # then add the positional encoding for the corresponding length
    x = self.embedding(x)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :input_seq_len, :]

    # apply dropout to the sum of the embeddings and the positional encoding; the Decoder does the same
    # this step may look a bit odd: it's just there to fight overfitting (regularization or dropout would both do),
    # and the diagram doesn't show it either
    x = self.dropout(x, training=training)

    # run through the N EncoderLayers to encode the sequence
    for i, enc_layer in enumerate(self.enc_layers):
      x = enc_layer(x, training, mask)

    return x
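
To tie everything together, here is a quick shape check of the whole Encoder. It needs the positional_encoding from yesterday's post, so I reproduce the standard sinusoidal version below to make the snippet self-contained (I'm assuming it matches Day 23); the hyperparameter values are only illustrative:

import numpy as np

def positional_encoding(position, d_model):
  # standard sinusoidal encoding: sin on even indices, cos on odd indices
  pos = np.arange(position)[:, np.newaxis]
  i = np.arange(d_model)[np.newaxis, :]
  angle_rads = pos / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
  angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
  return tf.cast(angle_rads[np.newaxis, ...], dtype=tf.float32)

# the do_MultiHeadAttention sketch above reads d_model / num_heads / depth from module scope,
# so set them before running this check
d_model, num_heads = 512, 8
depth = d_model // num_heads

sample_encoder = Encoder(num_layers=2, d_model=512, num_heads=8, dff=2048,
                         input_vocab_size=8500)
x = tf.random.uniform((64, 37), maxval=8500, dtype=tf.int32)
out = sample_encoder(x, training=False, mask=None)
print(out.shape)  # (64, 37, 512)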

Finally, let me share the architecture I use to train on music:

def TransformerGenerator(hparams, input_shape):

    n_vocab = hparams["EventDim"]
    n_embd = hparams["EmbeddingDim"]
    n_layer = hparams["Layers"]
    n_head = hparams["Heads"]
    n_sequence = hparams["Time"]

    batch_size = 1
    inputs = Input(shape=input_shape, dtype=tf.float32)

    # feed-forward implemented with a CNN
    h = dilated_causal_Conv1D(1,None,-1,-1)(inputs)
    nx = 512
    # no position encoding added
    # N-layer Encoder
    for layer in range(n_layer):
        ## ATTN ###
        nor = NormalizeDiagonal(n_embd)(h)
        a = MyConvld(nx, nx * 3, [batch_size, n_sequence])(nor)
        a, present = ATTN(nx, n_head,n_sequence)(a)
        a = MyConvld(nx, nx, [batch_size, n_sequence])(a)
        ##########
        h = Add()([h, a])
        ###########
        ##  MLP  ##
        nor = NormalizeDiagonal(n_embd)(h)
        a = MyConvld(nx, nx * 4, [batch_size, n_sequence])(nor)
        a = Activation("gelu")(a)
        m = MyConvld(nx * 4, nx, [batch_size, n_sequence])(a)
        ###########
        h = Add()([h, m])
        ###########

    ### output ###
    h = NormalizeDiagonal(n_embd)(h)
    ### map back to 0~1
    h = Dense(n_sequence)(h)
    ## just experimenting to see whether Transformer + GRU works even better
    h = GRU(256)(h)
    h = Activation("sigmoid")(h)
    h = Reshape((256,1))(h)
    return Model(inputs, h)

You can find the complete code here.

Summary

Today we finished implementing the Encoder architecture; tomorrow we'll take care of the Decoder!
