【Day15】AutoVC Implementation - TensorFlow Edition

The data preprocessing is the same as in the PyTorch edition, so I won't repeat it here; this post only covers the model and the training parts.

I later found that Keras's BatchNormalization also trains fine, so I ended up not training with my own BatchNormalization implementation.
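
For reference, the code below assumes the usual Keras imports plus a few global constants. This is only a minimal sketch; the constant values simply match the defaults used in Autovc further down and should be adjusted to your own preprocessing settings.

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv1D, BatchNormalization, ReLU,
                                     Activation, LSTM, Bidirectional, Dense)
from tensorflow.keras.models import Model
from IPython.display import clear_output   # used in the training loop below

# assumed globals, matching the Autovc defaults below
LEN_CROP = 176   # number of mel frames per training sample
DIM_EMB = 256    # speaker-embedding dimension
DIM_NECK = 32    # bottleneck dimension (the post later bumps this to 44)
FREQ = 22        # downsampling factor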

Content-Encoder

def Encoder(input_shape,dim_neck = 32 , dim_emb=256 , freq = 22):
    # mimics PyTorch's xavier_uniform_ with gain=sqrt(2); note that GlorotUniform's
    # argument is a random seed, not a gain, so VarianceScaling is used instead
    initializer = tf.keras.initializers.VarianceScaling(scale=2.0, mode="fan_avg", distribution="uniform")

    inp = Input(shape=input_shape)
    ### map the 336-dim axis (80 mel + 256 speaker embedding) to 512
    ###
    x = tf.transpose(inp,(0,2,1))
    x = Conv1D(512, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = ReLU()(x)     
    ###  
    x = tf.transpose(x,(0,2,1))
    x = Conv1D(512, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = ReLU()(x) 
    ###
    x = tf.transpose(x,(0,2,1))
    x = Conv1D(512, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = ReLU()(x) 

    ##############################################
    #   run the BLSTMs over the 176 (time) axis  #
    ##############################################

    x = tf.transpose(x,(0,2,1))
    lstm_tf_1 =  LSTM(32,return_sequences = True)
    lstm_tf_2 = LSTM(32,return_sequences = True)

    x = Bidirectional(lstm_tf_1)(x)
    x = Bidirectional(lstm_tf_2)(x)

    ## note the slicing below: this is the downsampling step
    x_up = x[:, :, :dim_neck]
    x_down = x[:, :,dim_neck:]
    codes = []

    for i in range(0, LEN_CROP, FREQ):
        codes.append(tf.concat((x_up[:,i+ freq-1,:],x_down[:,i,:]), axis=-1))


    return Model(inputs=inp, outputs = codes , name="content_encoder")
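
To sanity-check the downsampling, here is a quick shape test (a sketch using the globals assumed above): with len_crop = 176 and freq = 22 the loop yields 176 / 22 = 8 codes, each concatenating a forward slice and a backward slice of the BLSTM output.

enc = Encoder((DIM_EMB + 80, LEN_CROP), dim_neck=DIM_NECK, dim_emb=DIM_EMB, freq=FREQ)
dummy = tf.random.normal((2, DIM_EMB + 80, LEN_CROP))   # (batch, 336, 176)
codes = enc(dummy)
print(len(codes), codes[0].shape)                       # 8 codes, each (2, 64)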
    

Decoder

def Decoder(encoder_input_shape):
    # same as in the encoder: xavier_uniform_ with gain=sqrt(2)
    initializer = tf.keras.initializers.VarianceScaling(scale=2.0, mode="fan_avg", distribution="uniform")
    inputs = Input(shape = encoder_input_shape)

    # shape going into the LSTM is (2, 176, 320)
    x = LSTM(512,return_sequences = True,kernel_initializer=initializer)(inputs)


    """
    3  个 5x1 Conv + BN + ReLU
    """

    x = Conv1D(512, kernel_size =5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = ReLU()(x)

    x = tf.transpose(x,(0,2,1))
    x = Conv1D(512, kernel_size = 5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = ReLU()(x)

    x = tf.transpose(x,(0,2,1))
    x = Conv1D(512, kernel_size = 5 , strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = ReLU()(x)

    x = tf.transpose(x,(0,2,1))

    #################################################
    #   run the LSTMs on the 512-dim feature axis   #
    #################################################
    x = LSTM(1024,return_sequences = True,kernel_initializer=initializer)(x)
    x = LSTM(1024,return_sequences = True,kernel_initializer=initializer)(x)

    """
    Linear
    """
    x = Dense(80)(x)

    return Model(inputs = inputs, outputs = x, name="decoder")   
    

The second output of the Decoder (the postnet)

def Posnet(input_shape):
    # mimics PyTorch's xavier_uniform_ with gain=5/3 (the tanh gain); again,
    # GlorotUniform's argument is a seed, not a gain
    initializer = tf.keras.initializers.VarianceScaling(scale=(5 / 3) ** 2, mode="fan_avg", distribution="uniform")
    # plain Glorot (gain=1) for the final linear conv
    linear_init = tf.keras.initializers.GlorotUniform()
    inp = Input(shape = input_shape)

    """
    这里是第二个输出

    要把 80 的那维变成 512

    """

    """
    4  个 5x1 Conv + BN + ReLU
    """

    x = Conv1D(512, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(inp)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = Activation("tanh")(x)

    x = tf.transpose(x,(0,2,1))
    x = Conv1D(512, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))
    x = BatchNormalization()(x)
    x = Activation("tanh")(x)

    x = tf.transpose(x,(0,2,1))
    x = Conv1D(512, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))

    x = BatchNormalization()(x)
    x = Activation("tanh")(x)

    x = tf.transpose(x,(0,2,1))
    x = Conv1D(512, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=initializer)(x)
    x = tf.transpose(x,(0,2,1))

    x = BatchNormalization()(x)
    x = Activation("tanh")(x)


    x = tf.transpose(x,(0,2,1))
    x = Conv1D(80, kernel_size=5, strides=1,padding="same",dilation_rate=1,kernel_initializer=linear_init)(x)
    x = tf.transpose(x,(0,2,1))

    x = BatchNormalization()(x)

    return Model(inputs = inp, outputs = x, name="posnet")
    

Now combine the three modules above; this corresponds to the Generator in model_vc.

class Autovc(tf.keras.Model):
    def __init__(self,dim_neck = 32,dim_emb=256,len_crop=176,freq=22):
        super(Autovc, self).__init__()
        self.encoder = Encoder((dim_emb+80,len_crop),dim_neck,dim_emb,freq)
        self.decoder = Decoder((len_crop,320))
        self.postnet = Posnet((len_crop,80))

    def call(self, inputs):

        x = inputs[0]
        c_org = inputs[1]
        c_trg = inputs[-1]
        batch_size = tf.shape(x)[0]

        x = tf.transpose(x,(0,2,1))
        c_org = tf.expand_dims(c_org, axis=1)
        c_org = tf.transpose(tf.broadcast_to(c_org,(tf.shape(c_org)[0],LEN_CROP,tf.shape(c_org)[-1])),(0,2,1))
        # concatenate along the channel axis (the 80-dim one)
        x = tf.concat([x, c_org],axis=1)

        codes = self.encoder(x)
        if c_trg is None:
            return tf.concat(codes,axis=-1)

        tmp = []
        for code in codes:
            tc = tf.expand_dims(code,axis=1)
            tmp.append(tf.broadcast_to(tc,(batch_size,int(LEN_CROP/len(codes)),64))) 
        code_exp = tf.concat(tmp, axis=1)

        c_trg =  tf.expand_dims(c_trg, axis=1)                       
        c_trg = tf.broadcast_to(c_trg,(batch_size,tf.shape(x)[-1],DIM_EMB))

        # concatenate along the code axis (the 64-dim one)
        encoder_outputs = tf.concat((code_exp, c_trg), axis=-1)
        mel_outputs  = self.decoder(encoder_outputs)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = tf.transpose(mel_outputs_postnet,(0,2,1))
        mel_outputs_postnet = mel_outputs +  mel_outputs_postnet

        return mel_outputs, mel_outputs_postnet, tf.concat(codes, axis=-1)
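
A quick smoke test of the full forward pass (a sketch with random tensors, assuming the globals from the top of the post):

autovc = Autovc(dim_neck=DIM_NECK, dim_emb=DIM_EMB, len_crop=LEN_CROP, freq=FREQ)
x = tf.random.normal((2, LEN_CROP, 80))        # a batch of mel-spectrograms
emb = tf.random.normal((2, DIM_EMB))           # speaker embeddings
mel, mel_psnt, codes = autovc([x, emb, emb])
print(mel.shape, mel_psnt.shape, codes.shape)  # (2, 176, 80) (2, 176, 80) (2, 512)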

Train-Loop

  • For validation you need at least 30,000 training steps before the result is audible, which takes about 6 hours on a 2080 Ti. To get decent-sounding results you need around 1,000,000 steps, and the loss converges to about 0.0001.

Strangely enough, the TensorFlow version only works with DIM_NECK set to 44 (with 32 it produces a voice that sounds like nobody in particular). I haven't figured out why yet; my guess is a numerical-precision issue?

def train(step =30000 ,batch_size = 2):
    print(".....Strat.....")
    for j in range(step): 

        # same data loading as on the PyTorch side,
        # to make sure the inputs match the PyTorch version
        try:
            x_real, emb_org = next(data_iter)
        except (NameError, StopIteration):
            # first iteration, or the loader is exhausted: (re)build the iterator
            data_iter = iter(vcc_loader)
            x_real, emb_org = next(data_iter)
            
        # the loader yields torch tensors, so convert them back to numpy
        x_real = x_real.detach().cpu().numpy().astype(np.float32)
        emb_org = emb_org.detach().cpu().numpy().astype(np.float32) 

        # train_step is defined below
        g_loss_id, g_loss_id_psnt, g_loss_cd = train_step(x_real, emb_org,emb_org)


        if (j+1)%10 == 0:
            print(f"Step:{j}")
            print(f"G_loss_id:{g_loss_id}")
            print(f"G_loss_id_psnet:{g_loss_id_psnt}")
            print(f"G_loss_cd:{g_loss_cd}")
        if (j+2)%10 == 0:
            clear_output(wait=True)
        # save as often as you like; to use the weights later just call autovc.encoder.load_weights("encoder_weights")
        if (j+1)%10000 == 0:
            autovc.encoder.save_weights(f"model/encoder_weights_step_{j+1}.h5")
            autovc.decoder.save_weights(f"model/decoder_weights_step_{j+1}.h5")
            autovc.postnet.save_weights(f"model/postnet_weights_step_{j+1}.h5")
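
To reuse the checkpoints later, rebuild the model and load each sub-module's weights (a small sketch; the file names are the ones written by the loop above):

autovc = Autovc(dim_neck=DIM_NECK, dim_emb=DIM_EMB, len_crop=LEN_CROP, freq=FREQ)
autovc.encoder.load_weights("model/encoder_weights_step_30000.h5")
autovc.decoder.load_weights("model/decoder_weights_step_30000.h5")
autovc.postnet.load_weights("model/postnet_weights_step_30000.h5")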
            

Train-Step

autovc_optimizer = tf.keras.optimizers.Adam(0.0001)

@tf.function
def train_step(x_real,emb_org,emb_trg):
    # tf.GradientTape() together with tape.gradient() plays the role of loss.backward() in PyTorch

    with tf.GradientTape() as autovc_tape:
        x_identic, x_identic_psnt, code_real = autovc([x_real, emb_org, emb_trg])
        # see yesterday's post for the loss definitions
        g_loss_id = mse_loss(x_real, x_identic)
        g_loss_id_psnt = mse_loss(x_real, x_identic_psnt)

        code_reconst = autovc([x_identic_psnt, emb_org, None])

        g_loss_cd = l1_loss(code_real, code_reconst)
        g_loss = g_loss_id + g_loss_id_psnt + g_loss_cd


    gradients_of_autovc = autovc_tape.gradient(g_loss,autovc.trainable_variables)
    autovc_optimizer.apply_gradients(zip(gradients_of_autovc,autovc.trainable_variables))

    return g_loss_id, g_loss_id_psnt, g_loss_cd
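
mse_loss and l1_loss come from yesterday's post; if you are reading this one on its own, a minimal sketch that follows the AutoVC losses (MSE for the reconstructions, L1 for the content codes) looks like this:

mse_loss = tf.keras.losses.MeanSquaredError()
l1_loss = tf.keras.losses.MeanAbsoluteError()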

And then you can happily start training~~~

Summary

At this point we have implemented AutoVC twice. The TF version produces the same results as the PyTorch one; the only puzzling part is the dim_neck parameter: why does PyTorch succeed with 32 while TF does not? Still, with freq = 22 and dim_neck = 44 the converted voices from both versions sound about the same to me.

Update (10/1):
I found the cause: something went wrong in the down/upsampling step. It is fixed now, and both versions give the same result.

Update (10/2):
The refactored version of the code is done; you can download it here.

The Road Ahead

Now that we have the model, the next step is to find ways to make it better: the LSTMs here could potentially be replaced with a Transformer, or the training procedure could be tweaked (I've noticed a few fellow participants in this Ironman challenge writing dedicated Transformer series). Instead, I'm thinking of switching over to share some experience with GAN-based music generation and the topic of emotional responses to music XD. So the voice-conversion part ends here. We've finally made it past the halfway point. Keep it up, everyone!


