Day13 - 辨识模型 part2

model.py 会透过 DBiRNN class 来建构 CTC 模型，前一天中提到过我们是使用 LSTM 架构，也可以根据参数选择使用基本的 RNN 架构或是 GRU 架构。

build_multi_dynamic_brnn() function 就是在建立3层的双向 LSTM ，模型当中会加入 dropout 丢弃部分的神经元以避免模型在训练过程中 overfitting (根据 keep_prob 决定丢弃的比率)。

建立完 3 层的双向 LSTM 後面接着就是一层的 fully-connected 然後再经过 CTC (tf.nn.ctc_loss)计算得到输出序列。

# model.py
import argparse
import time
import datetime
import os
from six.moves import cPickle
from functools import wraps

import numpy as np
import tensorflow as tf
from tensorflow.contrib.rnn.python.ops import *
from tensorflow.python.ops.rnn import bidirectional_dynamic_rnn
from tensorflow.contrib import rnn

def dropout(x, keep_prob, is_training):
    return tf.contrib.layers.dropout(x, keep_prob=keep_prob, is_training=is_training)

def build_multi_dynamic_brnn(args,
                             maxTimeSteps,
                             inputX,
                             cell_fn,
                             seqLengths,
                             time_major=True):
    hid_input = inputX
    for i in range(args.num_layer):
        scope = 'DBRNN_' + str(i + 1)
        
        forward_cell = tf.contrib.rnn.LSTMCell(args.num_hidden,  forget_bias=1.0)
        
        backward_cell = tf.contrib.rnn.LSTMCell(args.num_hidden,  forget_bias=1.0)

        # tensor of shape: [max_timestamp, batch_size, input_size]
        outputs, output_states = bidirectional_dynamic_rnn(forward_cell, backward_cell,
                                                           inputs=hid_input,
                                                           dtype=tf.float32,
                                                           sequence_length=seqLengths,
                                                           time_major=True,
                                                           scope=scope)
        # forward output, backward ouput
        output_fw, output_bw = output
				# hidden state
        hidden = output_fw + output_bw
        # use dropout
        hidden = dropout(hidden, args.keep_prob, (args.mode == 'train'))
        
        if i != args.num_layer - 1:
            hid_input = hidden
        else:
            outputXrs = tf.reshape(hidden, [-1, args.num_hidden]) 
            output_list = tf.split(outputXrs, maxTimeSteps, 0)
            
            fbHrs = [tf.reshape(t, [args.batch_size, args.num_hidden]) for t in output_list]
            

    return fbHrs

class DBiRNN(object):
    def __init__(self, args, maxTimeSteps):
        self.args = args

        self.maxTimeSteps = maxTimeSteps
        if args.layerNormalization is True:
            if args.rnncell == 'rnn':
                self.cell_fn = lnBasicRNNCell
            elif args.rnncell == 'gru':
                self.cell_fn = lnGRUCell
            elif args.rnncell == 'lstm':
                self.cell_fn = lnBasicLSTMCell
            else:
                raise Exception("rnncell type not supported: {}".format(args.rnncell))
        else:
            if args.rnncell == 'rnn':
                self.cell_fn = tf.contrib.rnn.BasicRNNCell
            elif args.rnncell == 'gru':
                self.cell_fn = tf.contrib.rnn.GRUCell
            elif args.rnncell == 'lstm':
                self.cell_fn = tf.contrib.rnn.LSTMCell
            else:
                raise Exception("rnncell type not supported: {}".format(args.rnncell))

        self.build_graph(args, maxTimeSteps)

    def build_graph(self, args, maxTimeSteps):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.inputX = tf.placeholder(tf.float32,shape=(maxTimeSteps, args.batch_size, args.num_feature))  # [maxL,16,39]
            inputXrs = tf.reshape(self.inputX, [-1, args.num_feature])
            self.targetIxs = tf.placeholder(tf.int64)
            self.targetVals = tf.placeholder(tf.int32)
            self.targetShape = tf.placeholder(tf.int64)
            self.targetY = tf.SparseTensor(self.targetIxs, self.targetVals, self.targetShape)
            self.seqLengths = tf.placeholder(tf.int32, shape=(args.batch_size))

            
            self.config = {'name': args.model,
                           'rnncell': self.cell_fn,
                           'num_layer': args.num_layer,
                           'num_hidden': args.num_hidden,
                           'num_class': args.num_class,
                           'activation': args.activation,
                           'optimizer': args.optimizer,
                           'learning rate': args.learning_rate,
                           'keep prob': args.keep_prob,
                           'batch size': args.batch_size}

            fbHrs = build_multi_dynamic_brnn(self.args, maxTimeSteps, self.inputX, self.cell_fn, self.seqLengths)
            
            
            # fully connected
            with tf.name_scope('fc-layer'):
                with tf.variable_scope('fc'):
                    weightsClasses = tf.Variable(tf.truncated_normal([args.num_hidden, args.num_class]), name='weightsClasses')
                    biasesClasses = tf.Variable(tf.zeros([args.num_class]), name='biasesClasses')
                    logits = [tf.matmul(t, weightsClasses) + biasesClasses for t in fbHrs]
            
            
            logits3d = tf.stack(logits)
            
            self.var_op = tf.global_variables()
            self.var_trainable_op = tf.trainable_variables()
            
            self.loss = tf.reduce_mean(tf.nn.ctc_loss(self.targetY, logits3d, self.seqLengths))
            
            
            if args.grad_clip == -1:
                # not apply gradient clipping
                self.optimizer = tf.train.AdamOptimizer(args.learning_rate).minimize(self.loss)
            else:
                # apply gradient clipping
                grads, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.var_trainable_op), args.grad_clip)
                
                opti = tf.train.AdamOptimizer(args.learning_rate)
                self.optimizer = opti.apply_gradients(zip(grads, self.var_trainable_op))

            self.predictions = tf.to_int32(tf.nn.ctc_greedy_decoder(logits3d, self.seqLengths, merge_repeated=True)[0][0])

            if args.level == 'cha':
                self.errorRate = tf.reduce_sum(tf.edit_distance(self.predictions, self.targetY, normalize=True))

            self.initial_op = tf.global_variables_initializer()

            self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1, keep_checkpoint_every_n_hours=200)

介绍完了除噪模型与辨识模型，完整的模型架构如图 1:

图 1: 完整模型架构图