先上个图方便施工,虽然内部有些操作在图上没有画出来
图片上的 Feed-Forward 写成 Dense,实际上他是长这样
def Create_feed_forward_network(d_model, dff):
    """Build the feed-forward sub-network (FFN) used inside a Transformer layer.

    The FFN applies two linear transformations to its input with a ReLU
    in between.

    Args:
        d_model: Number of units of the second (output) Dense layer.
        dff: Number of units of the first (hidden) Dense layer.

    Returns:
        A ``tf.keras.Sequential`` stacking the two Dense layers.
    """
    # First linear map + ReLU: (batch_size, seq_len, dff)
    hidden_layer = Dense(dff, activation='relu')
    # Second linear map back to the model width: (batch_size, seq_len, d_model)
    output_layer = Dense(d_model)
    return tf.keras.Sequential([hidden_layer, output_layer])
一般会让 dff 这个参数大于 d_model,让 FFN 从输入的 d_model 维度里头学些有用的资讯。在论文中 d_model 为 512,dff 为 2048。两个都是可以调整的参数。
MultiHead 里最重要的就是要把 heads 给分离出来个别算,算完之後 concatenate 起来。
def split_heads(x, batch_size, num_head, depth):
    """Split the last axis of ``x`` into ``num_head`` heads of size ``depth``.

    Args:
        x: Tensor reshaped to ``(batch_size, -1, num_head, depth)``; its last
            axis is therefore expected to hold ``num_head * depth`` values.
        batch_size: Size of the leading (batch) axis.
        num_head: Number of attention heads.
        depth: Feature size of each head.

    Returns:
        The reshaped tensor with the head axis moved in front of the
        sequence axis, i.e. ``(batch_size, num_head, seq_len, depth)``.
    """
    # Use the `num_head` parameter; the previous body referenced an undefined
    # name `num_heads` and ignored the argument.
    x = tf.reshape(x, (batch_size, -1, num_head, depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])
def do_MultiHeadAttention(q, k, v, mask):
    """Run multi-head attention over ``q``, ``k`` and ``v``.

    Each input is linearly projected, split into heads, passed through
    ``do_attention``, and the per-head results are concatenated and projected
    once more.

    NOTE(review): ``seq_len``, ``num_head`` and ``depth`` are read from the
    enclosing scope. ``seq_len`` is used here as the projection width, so it
    presumably has to equal ``num_head * depth`` (the model width) -- confirm
    against the caller.
    NOTE(review): new ``Dense`` layers are created on every call, so weights
    are not shared between calls.

    Args:
        q: Query tensor; its leading axis is the batch axis.
        k: Key tensor.
        v: Value tensor.
        mask: Mask forwarded unchanged to ``do_attention``.

    Returns:
        The attention output after the final linear projection.
    """
    batch_size = tf.shape(q)[0]

    # Not drawn in the figure: q, k and v each get their own linear projection.
    q = Dense(seq_len)(q)
    k = Dense(seq_len)(k)
    v = Dense(seq_len)(v)

    # Each becomes (batch_size, num_heads, seq_len, depth).
    q = split_heads(q, batch_size, num_head, depth)
    k = split_heads(k, batch_size, num_head, depth)
    v = split_heads(v, batch_size, num_head, depth)

    # Scaled dot-product attention (covered in the previous article); the
    # attention weights are not needed here.
    scaled_attention, _ = do_attention(q, k, v, mask)

    # Merge the heads back together.
    scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
    concat_attention = tf.reshape(scaled_attention, (batch_size, -1, seq_len))

    # Final linear projection (also missing from the figure). The layer must be
    # built with a unit count and then applied to the tensor; the previous
    # `Dense(concat_attention)` passed the tensor as the unit count.
    output = Dense(seq_len)(concat_attention)
    return output
我们的版本其实已经不是原始的方法了,而是这篇论文的做法。原本的 source code 是 tf1.0 的,下面这个是我移植到 tf2.0 的版本,有验证过结果正确。我们音乐的 Transformer(但其实不是)里头的 ATTN 用的就是下面这个版本,它宣称是比较有效率的 ATTN,但我没有研究这篇论文,有兴趣的话你们可以参考看看。
class ATTN(tf.keras.layers.Layer):
    """Masked multi-head self-attention with an extra learned term ``E``.

    The layer expects a single tensor holding the concatenated query, key and
    value projections along axis 2 (see ``call``). Attention logits are the
    usual ``q @ k^T`` plus a term computed from ``q`` and the trainable
    variable ``E`` (see ``relative_attn``).

    Relies on a ``shape_list`` helper defined elsewhere in the file.
    """

    def __init__(self, n_state, n_head, seq):
        """Create the layer.

        Args:
            n_state: Width of each of the q/k/v slices (before head split).
            n_head: Number of attention heads.
            seq: Sequence length used to size ``E``.
        """
        super(ATTN, self).__init__()
        self.n_state = n_state * 3
        self.n_head = n_head
        E_initializer = tf.constant_initializer(0)
        # E must be [heads, sequence, features-per-head] so that it can be
        # batch-multiplied against q in `relative_attn`. Deriving the shape
        # from the arguments (instead of the former hard-coded [16, seq, 32])
        # yields the same shape for n_state=512, n_head=16 and keeps other
        # configurations working.
        self.E = tf.Variable(
            E_initializer(
                shape=[n_head, seq, n_state // n_head], dtype=tf.float32
            ),
            name="E",
        )

    def split_heads(self, x):
        """Go from [batch, sequence, features] to [batch, heads, sequence, features]."""
        return tf.transpose(self.split_states(x, self.n_head), [0, 2, 1, 3])

    def split_states(self, x, n):
        """Reshape the last dimension of x into [n, x.shape[-1]/n]."""
        *start, m = shape_list(x)
        return tf.reshape(x, start + [n, m // n])

    def merge_heads(self, x):
        """Reverse of ``split_heads``."""
        return self.merge_states(tf.transpose(x, [0, 2, 1, 3]))

    def mask_attn_weights(self, w):
        """Apply the mask from ``attention_mask`` to the attention logits.

        ``w`` has shape [batch, heads, dst_sequence, src_sequence], where
        information flows from src to dst. Masked-out positions are set to
        -1e10 so they vanish after the softmax.
        """
        _, _, nd, ns = shape_list(w)
        b = self.attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        w = w * b - tf.cast(1e10, w.dtype) * (1 - b)
        return w

    def merge_states(self, x):
        """Smash the last two dimensions of x into a single dimension."""
        *start, a, b = shape_list(x)
        return tf.reshape(x, start + [a * b])

    def attention_mask(self, nd, ns, *, dtype):
        """1's in the lower triangle, counting from the lower right corner.

        Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't
        produce garbage on TPUs.
        """
        i = tf.range(nd)[:, None]
        j = tf.range(ns)
        m = i >= j - ns + nd
        return tf.cast(m, dtype)

    def relative_attn(self, q):
        """Compute the ``E``-based logit term for queries ``q``.

        Args:
            q: Tensor of shape [batch, heads, sequence, features].

        Returns:
            Tensor of shape [batch, heads, sequence, sequence].
        """
        batch, heads, sequence, features = shape_list(q)
        # [heads, batch, sequence, features]
        q_ = tf.transpose(q, [1, 0, 2, 3])
        # [heads, batch * sequence, features]
        q_ = tf.reshape(q_, [heads, batch * sequence, features])
        # [heads, batch * sequence, sequence]
        rel = tf.matmul(q_, self.E, transpose_b=True)
        # [heads, batch, sequence, sequence]
        rel = tf.reshape(rel, [heads, batch, sequence, sequence])
        # Pad one column on the left, reinterpret the buffer with the last two
        # axes swapped in size, then drop the first row; this shifts the
        # entries of each row relative to the previous layout.
        # [heads, batch, sequence, 1+sequence]
        rel = tf.pad(rel, ((0, 0), (0, 0), (0, 0), (1, 0)))
        # [heads, batch, sequence+1, sequence]
        rel = tf.reshape(rel, (heads, batch, sequence + 1, sequence))
        # [heads, batch, sequence, sequence]
        rel = rel[:, :, 1:]
        # [batch, heads, sequence, sequence]
        rel = tf.transpose(rel, [1, 0, 2, 3])
        return rel

    def multihead_attn(self, q, k, v):
        """Masked attention over q, k, v of shape [batch, heads, sequence, features]."""
        w = tf.matmul(q, k, transpose_b=True)
        w = w + self.relative_attn(q)
        # Scale by 1/sqrt(features-per-head).
        w = w * tf.math.rsqrt(tf.cast(v.shape[-1], w.dtype))
        w = self.mask_attn_weights(w)
        w = tf.nn.softmax(w, axis=-1)
        a = tf.matmul(w, v)
        return a

    def call(self, inputs):
        """Run attention on a fused q/k/v tensor.

        Args:
            inputs: Tensor that is split into three equal parts along axis 2,
                taken as q, k and v in that order.

        Returns:
            A tuple ``(a, present)``: the merged attention output and the
            per-head k and v stacked along axis 1.
        """
        q, k, v = map(self.split_heads, tf.split(inputs, 3, axis=2))
        present = tf.stack([k, v], axis=1)
        a = self.multihead_attn(q, k, v)
        a = self.merge_heads(a)
        return a, present
有了需要的 layer 之后,接下来就是叠叠乐时间了,按图施工,保证成功!
# Encoder 里头会有 N 个 EncoderLayers,每个 EncoderLayer 里又有两个 sub-layers: MultiHeadAttention & feed_forward_network
# 先来组一份 EncoderLayer
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder layer: self-attention followed by a feed-forward network.

    Each of the two sub-layers is followed by dropout, a residual connection
    and layer normalization.
    """

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Create the sub-layers.

        Args:
            d_model: Model width passed to the feed-forward network.
            num_heads: Number of attention heads (accepted for interface
                compatibility; not used by this class itself).
            dff: Hidden width of the feed-forward network.
            rate: Dropout rate; the Transformer paper's default is 0.1.
        """
        super(EncoderLayer, self).__init__()
        self.ffn = Create_feed_forward_network(d_model, dff)
        # One layer norm per sub-layer.
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        # One dropout layer per sub-layer. It can be dropped if overfitting is
        # not a concern.
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, x, mask, isTraining=True):
        """Encode ``x``.

        Args:
            x: Input tensor, (batch_size, input_seq_len, d_model).
            mask: Padding mask hiding the zero-padded input positions.
            isTraining: Forwarded to the dropout layers, which behave
                differently during training and inference.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # The encoder uses self-attention, so q, k and v are all `x`.
        # `do_MultiHeadAttention` returns a single tensor, so it must not be
        # unpacked into two values.
        attn_output = do_MultiHeadAttention(x, x, x, mask)
        # Fixed the `isTrainig` typo, which raised NameError.
        attn_output = self.dropout1(attn_output, training=isTraining)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=isTraining)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2
最後加上 position encoding 跟 Embedding,Encoder 就炼成了
class Encoder(tf.keras.layers.Layer):
    """Transformer encoder: embedding + positional encoding + N encoder layers."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 rate=0.1, maximum_position_encoding=None):
        """Create the encoder.

        Args:
            num_layers: Number of ``EncoderLayer`` instances to stack.
            d_model: Embedding / model width (a size, not a model object).
            num_heads: Forwarded to every ``EncoderLayer``.
            dff: Forwarded to every ``EncoderLayer``.
            input_vocab_size: Vocabulary size of the embedding table.
            rate: Dropout rate.
            maximum_position_encoding: Number of positions to precompute in
                the positional-encoding table. Defaults to
                ``input_vocab_size``, which is what was always used before.
        """
        super(Encoder, self).__init__()
        self.d_model = d_model

        # Token indices are turned into embeddings first.
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)

        # The table length is a sequence-length bound and is unrelated to the
        # vocabulary size; keep the old value only as the default.
        if maximum_position_encoding is None:
            maximum_position_encoding = input_vocab_size
        self.pos_encoding = positional_encoding(
            maximum_position_encoding, self.d_model
        )

        # Build the N encoder layers.
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Encode a batch of index sequences.

        Args:
            x: Index tensor of shape (batch_size, input_seq_len).
            training: Whether dropout is active.
            mask: Padding mask forwarded to every encoder layer.

        Returns:
            Tensor of shape (batch_size, input_seq_len, d_model).
        """
        input_seq_len = tf.shape(x)[1]

        # Embed the indices, scale by sqrt(d_model) as in the paper, then add
        # the positional encoding for the actual sequence length.
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :input_seq_len, :]

        # Dropout on the sum of embedding and positional encoding, purely as a
        # guard against overfitting (not shown in the figure).
        x = self.dropout(x, training=training)

        # EncoderLayer.call is (x, mask, isTraining); pass the arguments in
        # that order. They were previously swapped.
        for enc_layer in self.enc_layers:
            x = enc_layer(x, mask, isTraining=training)

        return x
最後分享一下我拿来训练音乐的架构
def TransformerGenerator(hparams, input_shape):
    """Assemble the Keras model used by the author for music training.

    Args:
        hparams: Mapping providing the keys "EventDim", "EmbeddingDim",
            "Layers", "Heads" and "Time".
        input_shape: Shape passed to the Keras ``Input`` layer.

    Returns:
        A Keras ``Model`` mapping the input to a tensor reshaped to (256, 1).
    """
    vocab_size = hparams["EventDim"]  # looked up but not used below
    embed_dim = hparams["EmbeddingDim"]
    num_layers = hparams["Layers"]
    num_heads = hparams["Heads"]
    time_steps = hparams["Time"]
    batch = 1

    model_input = Input(shape=input_shape, dtype=tf.float32)

    # The input front-end is a CNN; no positional encoding is added.
    hidden = dilated_causal_Conv1D(1, None, -1, -1)(model_input)
    width = 512

    # N stacked encoder layers.
    for _ in range(num_layers):
        # --- attention sub-layer with residual connection ---
        normed = NormalizeDiagonal(embed_dim)(hidden)
        fused_qkv = MyConvld(width, width * 3, [batch, time_steps])(normed)
        attended, present = ATTN(width, num_heads, time_steps)(fused_qkv)
        attended = MyConvld(width, width, [batch, time_steps])(attended)
        hidden = Add()([hidden, attended])

        # --- MLP sub-layer with residual connection ---
        normed = NormalizeDiagonal(embed_dim)(hidden)
        expanded = MyConvld(width, width * 4, [batch, time_steps])(normed)
        expanded = Activation("gelu")(expanded)
        projected = MyConvld(width * 4, width, [batch, time_steps])(expanded)
        hidden = Add()([hidden, projected])

    # --- output head ---
    hidden = NormalizeDiagonal(embed_dim)(hidden)
    # Map back to the 0~1 range.
    hidden = Dense(time_steps)(hidden)
    # Experiment: check whether Transformer + GRU performs better.
    hidden = GRU(256)(hidden)
    hidden = Activation("sigmoid")(hidden)
    hidden = Reshape((256, 1))(hidden)
    return Model(model_input, hidden)
完整的程序码你可以在这边找到
今天我们实作完了 Encoder 的架构,明天再来解决 Decoder 吧!
>>: Day 9 Prototype 制作 - 资讯配置与确认 (Adobe XD)
在 Search Console 的概述中,第一个项目是流量成效,第二个项目是涵盖范围的有效网页数,...
正文 今天要设定gitlab上的专案,让他们能够在git commit时自动打包成docker im...
不过,除了在参数传递路由之外,还有一种情况是在母路由之外,还有子路由需要被切换,这种状况我们称作巢状...
好久没有看到的小七...好像也没多久,昨天才见过,但因为昨晚的那个恐怖经历,我现在好想看到她,好想听...
昨天用了D3的transition, 今天来试试看attrTween来让圆饼图长出来! 老样子先来看...