Sequence-to-sequence models have been applied successfully to machine translation, speech recognition, text summarization, and generative dialogue; almost any task that maps one sequence to another, such as text-to-speech, can be built on this architecture. In short, seq2seq uses one RNN (the encoder) to encode the input sequence and another RNN (the decoder) to decode the output sequence, with the encoder's final state serving as the decoder's initial state. For the underlying principles, see the posts below.
A basic RNN implementation: a simple RNN built on TensorFlow with plain Python code.
Dynamic vs. static RNN: compares the training time of dynamic and static RNNs on a Shakespeare text dataset and finds the dynamic RNN more efficient; the post also implements a custom RNN cell by subclassing TensorFlow's base RNN cell.
Handling sequences of different lengths: shorter sequences are padded so that every sequence in a batch has the same length.
A guide to implementing sequence-to-sequence models with RNNs: the encoder encodes the input sequence into a state vector; the decoder takes that state as its initial input to generate the first output, feeds the first output back in to generate the second, and so on.
Google's neural machine translation (GNMT) model: machine translation with a sequence-to-sequence model.
Text summarization with seq2seq: from the Google AI research blog, seq2seq plus attention for summarization, with the first two sentences of an article as input.
Automatic summarization: applying seq2seq to automatic summarization tasks.
This post annotates the neural machine translation code from the official TensorFlow seq2seq (NMT) tutorial to study the latest version of the seq2seq API. The new API is clearer than the API before version 1.3, and it also adds beam-search and sampling decoders for inference.
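To make the encoder-decoder wiring concrete before diving into the NMT code, here is a minimal toy sketch (not from the tutorial; all shapes and sizes are arbitrary): one RNN encodes the source, and its final state becomes the decoder RNN's initial state.

import tensorflow as tf

# Toy inputs: 4 source sequences of 7 steps and 4 target sequences of 5 steps,
# each step already embedded into 32-dim vectors (random placeholder values).
toy_src = tf.random_normal([4, 7, 32])
toy_tgt = tf.random_normal([4, 5, 32])

with tf.variable_scope("toy_encoder"):
  toy_encoder_cell = tf.contrib.rnn.BasicLSTMCell(64)
  _, toy_encoder_state = tf.nn.dynamic_rnn(
      toy_encoder_cell, toy_src, dtype=tf.float32)
with tf.variable_scope("toy_decoder"):
  toy_decoder_cell = tf.contrib.rnn.BasicLSTMCell(64)
  # The encoder's final state initializes the decoder.
  toy_decoder_outputs, _ = tf.nn.dynamic_rnn(
      toy_decoder_cell, toy_tgt, initial_state=toy_encoder_state,
      dtype=tf.float32)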
1. Load and preprocess the data
import os
import tensorflow as tf
import collections
train_data_en='./nmt/nmt/nmt_data/train.en'
train_data_vi='./nmt/nmt/nmt_data/train.vi'
vocab_en='./nmt/nmt/nmt_data/vocab.en'
vocab_vi='./nmt/nmt/nmt_data/vocab.vi'
# Load a text file and return its lines
def load_data(path):
input_file = os.path.join(path)
with open(input_file, "r") as f:
data = f.readlines()
return data
1.1 Inspect the format of each file
load_data(train_data_en)[:3]
load_data(train_data_vi)[:3]
load_data(vocab_en)[:10]
load_data(vocab_vi)[:10]
1.2 Build the vocabulary lookup tables
UNK = "<unk>"
SOS = "<s>"
EOS = "</s>"
UNK_ID = 0
def create_vocab_tables(src_vocab_file, tgt_vocab_file, share_vocab):
"""Creates vocab tables for src_vocab_file and tgt_vocab_file."""
  # Build an index lookup table from the words in the vocab file
src_vocab_table = tf.contrib.lookup.index_table_from_file(
src_vocab_file, default_value=UNK_ID)
if share_vocab:
tgt_vocab_table = src_vocab_table
else:
tgt_vocab_table = tf.contrib.lookup.index_table_from_file(
tgt_vocab_file, default_value=UNK_ID)
return src_vocab_table, tgt_vocab_table
sess=tf.InteractiveSession()
src_vocab_table,tgt_vocab_table=create_vocab_tables(vocab_vi,vocab_en,False)
features = tf.constant(["<unk>", "The", "science", "behind"])
ids = tgt_vocab_table.lookup(features)
tf.tables_initializer().run()
ids.eval()
1.3 Text preprocessing
# Define a namedtuple BatchedInput that wraps what the data pipeline returns
class BatchedInput(
collections.namedtuple("BatchedInput",
("initializer", "source", "target_input",
"target_output", "source_sequence_length",
"target_sequence_length"))):
pass
def get_iterator(src_dataset,
tgt_dataset,
src_vocab_table,
tgt_vocab_table,
batch_size,
sos,
eos,
random_seed,
num_buckets,
src_max_len=None,
tgt_max_len=None,
num_parallel_calls=4,
output_buffer_size=None,
skip_count=None,
num_shards=1,
shard_index=0,
reshuffle_each_iteration=True):
"""
  src_dataset: source dataset
  tgt_dataset: target dataset
  src_vocab_table: source vocabulary lookup table
  tgt_vocab_table: target vocabulary lookup table
  batch_size: batch size
  sos: start-of-sentence symbol
  eos: end-of-sentence symbol
"""
if not output_buffer_size:
output_buffer_size = batch_size * 1000
src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)
tgt_sos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(sos)), tf.int32)
tgt_eos_id = tf.cast(tgt_vocab_table.lookup(tf.constant(eos)), tf.int32)
  # Dataset.zip behaves like Python's built-in zip
src_tgt_dataset = tf.data.Dataset.zip((src_dataset, tgt_dataset))
  # Shard the dataset
src_tgt_dataset = src_tgt_dataset.shard(num_shards, shard_index)
if skip_count is not None:
src_tgt_dataset = src_tgt_dataset.skip(skip_count)
  # Shuffle the data
src_tgt_dataset = src_tgt_dataset.shuffle(
output_buffer_size, random_seed, reshuffle_each_iteration)
  # tf.string_split tokenizes each sentence on whitespace by default
  # prefetch builds a dataset that prefetches elements ahead of time
src_tgt_dataset = src_tgt_dataset.map(
lambda src, tgt: (
tf.string_split([src]).values, tf.string_split([tgt]).values),
num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
# Filter zero length input sequences.
src_tgt_dataset = src_tgt_dataset.filter(
lambda src, tgt: tf.logical_and(tf.size(src) > 0, tf.size(tgt) > 0))
if src_max_len:
src_tgt_dataset = src_tgt_dataset.map(
lambda src, tgt: (src[:src_max_len], tgt),
num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
if tgt_max_len:
src_tgt_dataset = src_tgt_dataset.map(
lambda src, tgt: (src, tgt[:tgt_max_len]),
num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
# Convert the word strings to ids. Word strings that are not in the
# vocab get the lookup table's default_value integer.
  # Convert word strings to their integer ids
src_tgt_dataset = src_tgt_dataset.map(
lambda src, tgt: (tf.cast(src_vocab_table.lookup(src), tf.int32),
tf.cast(tgt_vocab_table.lookup(tgt), tf.int32)),
num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
# Create a tgt_input prefixed with <sos> and a tgt_output suffixed with <eos>.
src_tgt_dataset = src_tgt_dataset.map(
lambda src, tgt: (src,
tf.concat(([tgt_sos_id], tgt), 0),
tf.concat((tgt, [tgt_eos_id]), 0)),
num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
# Add in sequence lengths.
src_tgt_dataset = src_tgt_dataset.map(
lambda src, tgt_in, tgt_out: (
src, tgt_in, tgt_out, tf.size(src), tf.size(tgt_in)),
num_parallel_calls=num_parallel_calls).prefetch(output_buffer_size)
# Bucket by source sequence length (buckets for lengths 0-9, 10-19, ...)
def batching_func(x):
return x.padded_batch(
batch_size,
# The first three entries are the source and target line rows;
# these have unknown-length vectors. The last two entries are
# the source and target row sizes; these are scalars.
padded_shapes=(
tf.TensorShape([None]), # src
tf.TensorShape([None]), # tgt_input
tf.TensorShape([None]), # tgt_output
tf.TensorShape([]), # src_len
tf.TensorShape([])), # tgt_len
# Pad the source and target sequences with eos tokens.
# (Though notice we don't generally need to do this since
# later on we will be masking out calculations past the true sequence.
padding_values=(
src_eos_id, # src
tgt_eos_id, # tgt_input
tgt_eos_id, # tgt_output
0, # src_len -- unused
0)) # tgt_len -- unused
if num_buckets > 1:
def key_func(unused_1, unused_2, unused_3, src_len, tgt_len):
# Calculate bucket_width by maximum source sequence length.
# Pairs with length [0, bucket_width) go to bucket 0, length
# [bucket_width, 2 * bucket_width) go to bucket 1, etc. Pairs with length
# over ((num_bucket-1) * bucket_width) words all go into the last bucket.
if src_max_len:
bucket_width = (src_max_len + num_buckets - 1) // num_buckets
else:
bucket_width = 10
# Bucket sentence pairs by the length of their source sentence and target
# sentence.
bucket_id = tf.maximum(src_len // bucket_width, tgt_len // bucket_width)
return tf.to_int64(tf.minimum(num_buckets, bucket_id))
def reduce_func(unused_key, windowed_data):
return batching_func(windowed_data)
batched_dataset = src_tgt_dataset.apply(
tf.contrib.data.group_by_window(
key_func=key_func, reduce_func=reduce_func, window_size=batch_size))
else:
batched_dataset = batching_func(src_tgt_dataset)
batched_iter = batched_dataset.make_initializable_iterator()
(src_ids, tgt_input_ids, tgt_output_ids, src_seq_len,
tgt_seq_len) = (batched_iter.get_next())
return BatchedInput(
initializer=batched_iter.initializer,
source=src_ids,
target_input=tgt_input_ids,
target_output=tgt_output_ids,
source_sequence_length=src_seq_len,
target_sequence_length=tgt_seq_len)
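As a quick sanity check of the pipeline above, the iterator can be exercised as follows. This is a minimal sketch: the batch size and bucket count are arbitrary values rather than the tutorial's defaults, and it reuses the vocab tables and session created in section 1.2.

src_dataset = tf.data.TextLineDataset(train_data_vi)
tgt_dataset = tf.data.TextLineDataset(train_data_en)
batched_input = get_iterator(
    src_dataset, tgt_dataset, src_vocab_table, tgt_vocab_table,
    batch_size=32, sos=SOS, eos=EOS, random_seed=123, num_buckets=5)
sess.run(tf.tables_initializer())
sess.run(batched_input.initializer)
src_batch, src_len = sess.run(
    [batched_input.source, batched_input.source_sequence_length])
print(src_batch.shape, src_len[:5])

Each run returns a padded [batch_size, max_src_len] id matrix together with the true sequence lengths that are later used for masking.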
2. Define the model
2.1 BaseModel, the base class that the later models inherit from
VOCAB_SIZE_THRESHOLD_CPU = 50000
def _get_embed_device(vocab_size):
"""Decide on which device to place an embed matrix given its vocab size."""
if vocab_size > VOCAB_SIZE_THRESHOLD_CPU:
return "/cpu:0"
else:
return "/gpu:0"
# Variable initializer factory
def get_initializer(init_op, seed=None, init_weight=None):
"""Create an initializer. init_weight is only for uniform."""
if init_op == "uniform":
assert init_weight
return tf.random_uniform_initializer(
-init_weight, init_weight, seed=seed)
elif init_op == "glorot_normal":
return tf.keras.initializers.glorot_normal(
seed=seed)
elif init_op == "glorot_uniform":
return tf.keras.initializers.glorot_uniform(
seed=seed)
else:
raise ValueError("Unknown init_op %s" % init_op)
# Gradient clipping
def gradient_clip(gradients, max_gradient_norm):
"""Clipping gradients of a model."""
clipped_gradients, gradient_norm = tf.clip_by_global_norm(
gradients, max_gradient_norm)
gradient_norm_summary = [tf.summary.scalar("grad_norm", gradient_norm)]
gradient_norm_summary.append(
tf.summary.scalar("clipped_gradient", tf.global_norm(clipped_gradients)))
return clipped_gradients, gradient_norm_summary, gradient_norm
import codecs
import numpy as np
from tensorflow.python.layers import core as layers_core
from tensorflow.python.util import nest
# The helpers below (and the model classes further down) use utils.print_out.
# It lives in the NMT repo's misc_utils; adjust the import path to wherever the
# cloned repo sits on your sys.path.
from nmt.utils import misc_utils as utils
# Load the vocabulary word list used for the embeddings
def load_vocab(vocab_file):
vocab = []
with codecs.getreader("utf-8")(tf.gfile.GFile(vocab_file, "rb")) as f:
vocab_size = 0
for word in f:
vocab_size += 1
vocab.append(word.strip())
return vocab, vocab_size
# Load pretrained embedding vectors from a text file
def load_embed_txt(embed_file):
"""Load embed_file into a python dictionary.
Note: the embed_file should be a Glove formated txt file. Assuming
embed_size=5, for example:
the -0.071549 0.093459 0.023738 -0.090339 0.056123
to 0.57346 0.5417 -0.23477 -0.3624 0.4037
and 0.20327 0.47348 0.050877 0.002103 0.060547
Args:
embed_file: file path to the embedding file.
Returns:
a dictionary that maps word to vector, and the size of embedding dimensions.
"""
emb_dict = dict()
emb_size = None
with codecs.getreader("utf-8")(tf.gfile.GFile(embed_file, 'rb')) as f:
for line in f:
tokens = line.strip().split(" ")
word = tokens[0]
vec = list(map(float, tokens[1:]))
emb_dict[word] = vec
if emb_size:
assert emb_size == len(vec), "All embedding size should be same."
else:
emb_size = len(vec)
return emb_dict, emb_size
# Build an embedding matrix from pretrained word embeddings
def _create_pretrained_emb_from_txt(
vocab_file, embed_file, num_trainable_tokens=3, dtype=tf.float32,
scope=None):
"""Load pretrain embeding from embed_file, and return an embedding matrix.
Args:
embed_file: Path to a Glove formated embedding txt file.
num_trainable_tokens: Make the first n tokens in the vocab file as trainable
variables. Default is 3, which is "<unk>", "<s>" and "</s>".
"""
vocab, _ = load_vocab(vocab_file)
trainable_tokens = vocab[:num_trainable_tokens]
utils.print_out("# Using pretrained embedding: %s." % embed_file)
utils.print_out(" with trainable tokens: ")
emb_dict, emb_size = load_embed_txt(embed_file)
for token in trainable_tokens:
utils.print_out(" %s" % token)
if token not in emb_dict:
emb_dict[token] = [0.0] * emb_size
emb_mat = np.array(
[emb_dict[token] for token in vocab], dtype=dtype.as_numpy_dtype())
emb_mat = tf.constant(emb_mat)
emb_mat_const = tf.slice(emb_mat, [num_trainable_tokens, 0], [-1, -1])
with tf.variable_scope(scope or "pretrain_embeddings", dtype=dtype) as scope:
with tf.device(_get_embed_device(num_trainable_tokens)):
emb_mat_var = tf.get_variable(
"emb_mat_var", [num_trainable_tokens, emb_size])
  # Concatenate the trainable rows with the frozen constant rows into one
  # embedding tensor; only the first num_trainable_tokens rows are trained.
return tf.concat([emb_mat_var, emb_mat_const], 0)
# Load a pretrained embedding from file, or create a new one trained from scratch
def _create_or_load_embed(embed_name, vocab_file, embed_file,
vocab_size, embed_size, dtype):
"""Create a new or load an existing embedding matrix."""
if vocab_file and embed_file:
embedding = _create_pretrained_emb_from_txt(vocab_file, embed_file)
else:
with tf.device(_get_embed_device(vocab_size)):
embedding = tf.get_variable(
embed_name, [vocab_size, embed_size], dtype)
return embedding
# Create the embedding matrices for the encoder and the decoder
def create_emb_for_encoder_and_decoder(share_vocab,
src_vocab_size,
tgt_vocab_size,
src_embed_size,
tgt_embed_size,
dtype=tf.float32,
num_partitions=0,
src_vocab_file=None,
tgt_vocab_file=None,
src_embed_file=None,
tgt_embed_file=None,
scope=None):
"""Create embedding matrix for both encoder and decoder.
Args:
share_vocab: A boolean. Whether to share embedding matrix for both
encoder and decoder.
src_vocab_size: An integer. The source vocab size.
tgt_vocab_size: An integer. The target vocab size.
src_embed_size: An integer. The embedding dimension for the encoder's
embedding.
tgt_embed_size: An integer. The embedding dimension for the decoder's
embedding.
dtype: dtype of the embedding matrix. Default to float32.
num_partitions: number of partitions used for the embedding vars.
scope: VariableScope for the created subgraph. Default to "embedding".
Returns:
embedding_encoder: Encoder's embedding matrix.
embedding_decoder: Decoder's embedding matrix.
Raises:
ValueError: if use share_vocab but source and target have different vocab
size.
"""
if num_partitions <= 1:
partitioner = None
else:
# Note: num_partitions > 1 is required for distributed training due to
# embedding_lookup tries to colocate single partition-ed embedding variable
# with lookup ops. This may cause embedding variables being placed on worker
# jobs.
partitioner = tf.fixed_size_partitioner(num_partitions)
if (src_embed_file or tgt_embed_file) and partitioner:
raise ValueError(
"Can't set num_partitions > 1 when using pretrained embedding")
with tf.variable_scope(
scope or "embeddings", dtype=dtype, partitioner=partitioner) as scope:
# Share embedding
if share_vocab:
if src_vocab_size != tgt_vocab_size:
raise ValueError("Share embedding but different src/tgt vocab sizes"
" %d vs. %d" % (src_vocab_size, tgt_vocab_size))
assert src_embed_size == tgt_embed_size
utils.print_out("# Use the same embedding for source and target")
vocab_file = src_vocab_file or tgt_vocab_file
embed_file = src_embed_file or tgt_embed_file
embedding_encoder = _create_or_load_embed(
"embedding_share", vocab_file, embed_file,
src_vocab_size, src_embed_size, dtype)
embedding_decoder = embedding_encoder
else:
with tf.variable_scope("encoder", partitioner=partitioner):
embedding_encoder = _create_or_load_embed(
"embedding_encoder", src_vocab_file, src_embed_file,
src_vocab_size, src_embed_size, dtype)
with tf.variable_scope("decoder", partitioner=partitioner):
embedding_decoder = _create_or_load_embed(
"embedding_decoder", tgt_vocab_file, tgt_embed_file,
tgt_vocab_size, tgt_embed_size, dtype)
return embedding_encoder, embedding_decoder
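For example, untied source and target embeddings can be created like this (a sketch with placeholder vocabulary and embedding sizes; since no pretrained embedding files are passed, fresh variables are created):

embedding_encoder, embedding_decoder = create_emb_for_encoder_and_decoder(
    share_vocab=False,
    src_vocab_size=10000,   # placeholder; use the real vocab sizes
    tgt_vocab_size=10000,
    src_embed_size=128,
    tgt_embed_size=128)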
# Get the device string for a given layer in a multi-GPU setup
def get_device_str(device_id, num_gpus):
"""Return a device string for multi-GPU setup."""
if num_gpus == 0:
return "/cpu:0"
device_str_output = "/gpu:%d" % (device_id % num_gpus)
return device_str_output
# Create a single RNN cell of the requested unit type
def _single_cell(unit_type, num_units, forget_bias, dropout, mode,
residual_connection=False, device_str=None, residual_fn=None):
"""Create an instance of a single RNN cell."""
# dropout (= 1 - keep_prob) is set to 0 during eval and infer
dropout = dropout if mode == tf.contrib.learn.ModeKeys.TRAIN else 0.0
# Cell Type
if unit_type == "lstm":
utils.print_out(" LSTM, forget_bias=%g" % forget_bias, new_line=False)
single_cell = tf.contrib.rnn.BasicLSTMCell(
num_units,
forget_bias=forget_bias)
elif unit_type == "gru":
utils.print_out(" GRU", new_line=False)
single_cell = tf.contrib.rnn.GRUCell(num_units)
elif unit_type == "layer_norm_lstm":
utils.print_out(" Layer Normalized LSTM, forget_bias=%g" % forget_bias,
new_line=False)
single_cell = tf.contrib.rnn.LayerNormBasicLSTMCell(
num_units,
forget_bias=forget_bias,
layer_norm=True)
elif unit_type == "nas":
utils.print_out(" NASCell", new_line=False)
single_cell = tf.contrib.rnn.NASCell(num_units)
else:
raise ValueError("Unknown unit type %s!" % unit_type)
# Create a list of RNN cells using single_cell_fn
def _cell_list(unit_type, num_units, num_layers, num_residual_layers,
forget_bias, dropout, mode, num_gpus, base_gpu=0,
single_cell_fn=None, residual_fn=None):
"""Create a list of RNN cells."""
if not single_cell_fn:
single_cell_fn = _single_cell
# Multi-GPU
cell_list = []
for i in range(num_layers):
utils.print_out(" cell %d" % i, new_line=False)
single_cell = single_cell_fn(
unit_type=unit_type,
num_units=num_units,
forget_bias=forget_bias,
dropout=dropout,
mode=mode,
residual_connection=(i >= num_layers - num_residual_layers),
device_str=get_device_str(i + base_gpu, num_gpus),
residual_fn=residual_fn
)
utils.print_out("")
cell_list.append(single_cell)
return cell_list
# Create a (possibly multi-layer) RNN cell from the list of cells
def create_rnn_cell(unit_type, num_units, num_layers, num_residual_layers,
forget_bias, dropout, mode, num_gpus, base_gpu=0,
single_cell_fn=None):
"""Create multi-layer RNN cell.
Args:
unit_type: string representing the unit type, i.e. "lstm".
num_units: the depth of each unit.
num_layers: number of cells.
num_residual_layers: Number of residual layers from top to bottom. For
example, if `num_layers=4` and `num_residual_layers=2`, the last 2 RNN
cells in the returned list will be wrapped with `ResidualWrapper`.
forget_bias: the initial forget bias of the RNNCell(s).
dropout: floating point value between 0.0 and 1.0:
the probability of dropout. this is ignored if `mode != TRAIN`.
mode: either tf.contrib.learn.TRAIN/EVAL/INFER
num_gpus: The number of gpus to use when performing round-robin
placement of layers.
base_gpu: The gpu device id to use for the first RNN cell in the
returned list. The i-th RNN cell will use `(base_gpu + i) % num_gpus`
as its device id.
single_cell_fn: allow for adding customized cell.
When not specified, we default to model_helper._single_cell
Returns:
An `RNNCell` instance.
"""
cell_list = _cell_list(unit_type=unit_type,
num_units=num_units,
num_layers=num_layers,
num_residual_layers=num_residual_layers,
forget_bias=forget_bias,
dropout=dropout,
mode=mode,
num_gpus=num_gpus,
base_gpu=base_gpu,
single_cell_fn=single_cell_fn)
if len(cell_list) == 1: # Single layer.
return cell_list[0]
else: # Multi layers
return tf.contrib.rnn.MultiRNNCell(cell_list)
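For instance, a two-layer LSTM stack with dropout, the shape used by the encoder and decoder builders below, could be created like this (placeholder hyperparameters; note that it relies on the utils.print_out helper imported earlier):

cell = create_rnn_cell(
    unit_type="lstm", num_units=128, num_layers=2, num_residual_layers=0,
    forget_bias=1.0, dropout=0.2, mode=tf.contrib.learn.ModeKeys.TRAIN,
    num_gpus=1)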
import abc
class BaseModel(object):
"""Sequence-to-sequence base class.
"""
def __init__(self,
hparams,
mode,
iterator,
source_vocab_table,
target_vocab_table,
reverse_target_vocab_table=None,
scope=None,
extra_args=None):
"""Create the model.
Args:
hparams: Hyperparameter configurations.
mode: TRAIN | EVAL | INFER
iterator: Dataset Iterator that feeds data.
source_vocab_table: Lookup table mapping source words to ids.
target_vocab_table: Lookup table mapping target words to ids.
reverse_target_vocab_table: Lookup table mapping ids to target words. Only
required in INFER mode. Defaults to None.
scope: scope of the model.
extra_args: model_helper.ExtraArgs, for passing customizable functions.
"""
assert isinstance(iterator, BatchedInput)
    # Input data is provided through the dataset iterator
self.iterator = iterator
self.mode = mode
    # Vocabulary lookup tables for the source and target
self.src_vocab_table = source_vocab_table
self.tgt_vocab_table = target_vocab_table
self.src_vocab_size = hparams.src_vocab_size
self.tgt_vocab_size = hparams.tgt_vocab_size
self.num_gpus = hparams.num_gpus
self.time_major = hparams.time_major
# extra_args: to make it flexible for adding external customizable code
self.single_cell_fn = None
if extra_args:
self.single_cell_fn = extra_args.single_cell_fn
# Set num layers
    # Number of RNN layers in the encoder and decoder
self.num_encoder_layers = hparams.num_encoder_layers
self.num_decoder_layers = hparams.num_decoder_layers
assert self.num_encoder_layers
assert self.num_decoder_layers
# Set num residual layers
if hasattr(hparams, "num_residual_layers"): # compatible common_test_utils
self.num_encoder_residual_layers = hparams.num_residual_layers
self.num_decoder_residual_layers = hparams.num_residual_layers
else:
self.num_encoder_residual_layers = hparams.num_encoder_residual_layers
self.num_decoder_residual_layers = hparams.num_decoder_residual_layers
# Initializer
initializer = get_initializer(
hparams.init_op, hparams.random_seed, hparams.init_weight)
tf.get_variable_scope().set_initializer(initializer)
# Embeddings
self.init_embeddings(hparams, scope)
self.batch_size = tf.size(self.iterator.source_sequence_length)
# Projection
with tf.variable_scope(scope or "build_network"):
with tf.variable_scope("decoder/output_projection"):
self.output_layer = layers_core.Dense(
hparams.tgt_vocab_size, use_bias=False, name="output_projection")
## Train graph
    ## The entire model graph is built inside this call
res = self.build_graph(hparams, scope=scope)
if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
self.train_loss = res[1]
self.word_count = tf.reduce_sum(
self.iterator.source_sequence_length) + tf.reduce_sum(
self.iterator.target_sequence_length)
elif self.mode == tf.contrib.learn.ModeKeys.EVAL:
self.eval_loss = res[1]
elif self.mode == tf.contrib.learn.ModeKeys.INFER:
self.infer_logits, _, self.final_context_state, self.sample_id = res
self.sample_words = reverse_target_vocab_table.lookup(
tf.to_int64(self.sample_id))
if self.mode != tf.contrib.learn.ModeKeys.INFER:
## Count the number of predicted words for compute ppl.
self.predict_count = tf.reduce_sum(
self.iterator.target_sequence_length)
    # Global step counter used to track training iterations
self.global_step = tf.Variable(0, trainable=False)
params = tf.trainable_variables()
# Gradients and SGD update operation for training the model.
    # Arrange for the embedding vars to appear at the beginning.
    # Build the optimizer for training
if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
self.learning_rate = tf.constant(hparams.learning_rate)
# warm-up
self.learning_rate = self._get_learning_rate_warmup(hparams)
# decay
self.learning_rate = self._get_learning_rate_decay(hparams)
# Optimizer
if hparams.optimizer == "sgd":
opt = tf.train.GradientDescentOptimizer(self.learning_rate)
tf.summary.scalar("lr", self.learning_rate)
elif hparams.optimizer == "adam":
opt = tf.train.AdamOptimizer(self.learning_rate)
# Gradients
gradients = tf.gradients(
self.train_loss,
params,
colocate_gradients_with_ops=hparams.colocate_gradients_with_ops)
clipped_grads, grad_norm_summary, grad_norm = gradient_clip(
gradients, max_gradient_norm=hparams.max_gradient_norm)
self.grad_norm = grad_norm
      # Apply the gradients to update the parameters
self.update = opt.apply_gradients(
zip(clipped_grads, params), global_step=self.global_step)
# Summary
self.train_summary = tf.summary.merge([
tf.summary.scalar("lr", self.learning_rate),
tf.summary.scalar("train_loss", self.train_loss),
] + grad_norm_summary)
if self.mode == tf.contrib.learn.ModeKeys.INFER:
self.infer_summary = self._get_infer_summary(hparams)
# Saver
self.saver = tf.train.Saver(
tf.global_variables(), max_to_keep=hparams.num_keep_ckpts)
# Print trainable variables
utils.print_out("# Trainable variables")
for param in params:
utils.print_out(" %s, %s, %s" % (param.name, str(param.get_shape()),
param.op.device))
def _get_learning_rate_warmup(self, hparams):
"""Get learning rate warmup."""
warmup_steps = hparams.warmup_steps
warmup_scheme = hparams.warmup_scheme
utils.print_out(" learning_rate=%g, warmup_steps=%d, warmup_scheme=%s" %
(hparams.learning_rate, warmup_steps, warmup_scheme))
# Apply inverse decay if global steps less than warmup steps.
# Inspired by https://arxiv.org/pdf/1706.03762.pdf (Section 5.3)
# When step < warmup_steps,
# learing_rate *= warmup_factor ** (warmup_steps - step)
if warmup_scheme == "t2t":
# 0.01^(1/warmup_steps): we start with a lr, 100 times smaller
warmup_factor = tf.exp(tf.log(0.01) / warmup_steps)
inv_decay = warmup_factor**(
tf.to_float(warmup_steps - self.global_step))
else:
raise ValueError("Unknown warmup scheme %s" % warmup_scheme)
return tf.cond(
self.global_step < hparams.warmup_steps,
lambda: inv_decay * self.learning_rate,
lambda: self.learning_rate,
name="learning_rate_warump_cond")
def _get_learning_rate_decay(self, hparams):
"""Get learning rate decay."""
if hparams.decay_scheme in ["luong5", "luong10", "luong234"]:
decay_factor = 0.5
if hparams.decay_scheme == "luong5":
start_decay_step = int(hparams.num_train_steps / 2)
decay_times = 5
elif hparams.decay_scheme == "luong10":
start_decay_step = int(hparams.num_train_steps / 2)
decay_times = 10
elif hparams.decay_scheme == "luong234":
start_decay_step = int(hparams.num_train_steps * 2 / 3)
decay_times = 4
remain_steps = hparams.num_train_steps - start_decay_step
decay_steps = int(remain_steps / decay_times)
elif not hparams.decay_scheme: # no decay
start_decay_step = hparams.num_train_steps
decay_steps = 0
decay_factor = 1.0
elif hparams.decay_scheme:
raise ValueError("Unknown decay scheme %s" % hparams.decay_scheme)
utils.print_out(" decay_scheme=%s, start_decay_step=%d, decay_steps %d, "
"decay_factor %g" % (hparams.decay_scheme,
start_decay_step,
decay_steps,
decay_factor))
return tf.cond(
self.global_step < start_decay_step,
lambda: self.learning_rate,
lambda: tf.train.exponential_decay(
self.learning_rate,
(self.global_step - start_decay_step),
decay_steps, decay_factor, staircase=True),
name="learning_rate_decay_cond")
  # Create the embedding layers via the helper function defined above
def init_embeddings(self, hparams, scope):
"""Init embeddings."""
self.embedding_encoder, self.embedding_decoder = (
create_emb_for_encoder_and_decoder(
share_vocab=hparams.share_vocab,
src_vocab_size=self.src_vocab_size,
tgt_vocab_size=self.tgt_vocab_size,
src_embed_size=hparams.num_units,
tgt_embed_size=hparams.num_units,
num_partitions=hparams.num_embeddings_partitions,
src_vocab_file=hparams.src_vocab_file,
tgt_vocab_file=hparams.tgt_vocab_file,
src_embed_file=hparams.src_embed_file,
tgt_embed_file=hparams.tgt_embed_file,
scope=scope,))
  # Run one training step
def train(self, sess):
assert self.mode == tf.contrib.learn.ModeKeys.TRAIN
return sess.run([self.update,
self.train_loss,
self.predict_count,
self.train_summary,
self.global_step,
self.word_count,
self.batch_size,
self.grad_norm,
self.learning_rate])
  # Run one evaluation step
def eval(self, sess):
assert self.mode == tf.contrib.learn.ModeKeys.EVAL
return sess.run([self.eval_loss,
self.predict_count,
self.batch_size])
  # Build the model graph
def build_graph(self, hparams, scope=None):
"""Subclass must implement this method.
Creates a sequence-to-sequence model with dynamic RNN decoder API.
Args:
hparams: Hyperparameter configurations.
scope: VariableScope for the created subgraph; default "dynamic_seq2seq".
Returns:
A tuple of the form (logits, loss, final_context_state),
where:
logits: float32 Tensor [batch_size x num_decoder_symbols].
loss: the total loss / batch_size.
final_context_state: The final state of decoder RNN.
Raises:
ValueError: if encoder_type differs from mono and bi, or
attention_option is not (luong | scaled_luong |
bahdanau | normed_bahdanau).
"""
utils.print_out("# creating %s graph ..." % self.mode)
dtype = tf.float32
with tf.variable_scope(scope or "dynamic_seq2seq", dtype=dtype):
# Encoder
      # Build the encoder; returns the per-time-step outputs and the final state
encoder_outputs, encoder_state = self._build_encoder(hparams)
## Decoder
      # Build the decoder; the encoder's final state initializes the decoder
logits, sample_id, final_context_state = self._build_decoder(
encoder_outputs, encoder_state, hparams)
## Loss
if self.mode != tf.contrib.learn.ModeKeys.INFER:
with tf.device(get_device_str(self.num_encoder_layers - 1,
self.num_gpus)):
loss = self._compute_loss(logits)
else:
loss = None
return logits, loss, final_context_state, sample_id
  # Every subclass must implement this method
@abc.abstractmethod
def _build_encoder(self, hparams):
"""Subclass must implement this.
Build and run an RNN encoder.
Args:
hparams: Hyperparameters configurations.
Returns:
A tuple of encoder_outputs and encoder_state.
"""
pass
  # Create the encoder's multi-layer RNN cell
def _build_encoder_cell(self, hparams, num_layers, num_residual_layers,
base_gpu=0):
"""Build a multi-layer RNN cell that can be used by encoder."""
return create_rnn_cell(
unit_type=hparams.unit_type,
num_units=hparams.num_units,
num_layers=num_layers,
num_residual_layers=num_residual_layers,
forget_bias=hparams.forget_bias,
dropout=hparams.dropout,
num_gpus=hparams.num_gpus,
mode=self.mode,
base_gpu=base_gpu,
single_cell_fn=self.single_cell_fn)
def _get_infer_maximum_iterations(self, hparams, source_sequence_length):
"""Maximum decoding steps at inference time."""
if hparams.tgt_max_len_infer:
maximum_iterations = hparams.tgt_max_len_infer
utils.print_out(" decoding maximum_iterations %d" % maximum_iterations)
else:
# TODO(thangluong): add decoding_length_factor flag
decoding_length_factor = 2.0
max_encoder_length = tf.reduce_max(source_sequence_length)
maximum_iterations = tf.to_int32(tf.round(
tf.to_float(max_encoder_length) * decoding_length_factor))
return maximum_iterations
  # Build the decoder
def _build_decoder(self, encoder_outputs, encoder_state, hparams):
"""Build and run a RNN decoder with a final projection layer.
Args:
encoder_outputs: The outputs of encoder for every time step.
encoder_state: The final state of the encoder.
hparams: The Hyperparameters configurations.
Returns:
A tuple of final logits and final decoder state:
logits: size [time, batch_size, vocab_size] when time_major=True.
"""
tgt_sos_id = tf.cast(self.tgt_vocab_table.lookup(tf.constant(hparams.sos)),
tf.int32)
tgt_eos_id = tf.cast(self.tgt_vocab_table.lookup(tf.constant(hparams.eos)),
tf.int32)
iterator = self.iterator
# maximum_iteration: The maximum decoding steps.
maximum_iterations = self._get_infer_maximum_iterations(
hparams, iterator.source_sequence_length)
## Decoder.
with tf.variable_scope("decoder") as decoder_scope:
cell, decoder_initial_state = self._build_decoder_cell(
hparams, encoder_outputs, encoder_state,
iterator.source_sequence_length)
## Train or eval
      # Training / evaluation branch of the decoder
if self.mode != tf.contrib.learn.ModeKeys.INFER:
# decoder_emp_inp: [max_time, batch_size, num_units]
target_input = iterator.target_input
if self.time_major:
target_input = tf.transpose(target_input)
decoder_emb_inp = tf.nn.embedding_lookup(
self.embedding_decoder, target_input)
# Helper
        # Helper that feeds the embedded target inputs to the decoder during training
helper = tf.contrib.seq2seq.TrainingHelper(
decoder_emb_inp, iterator.target_sequence_length,
time_major=self.time_major)
# Decoder
        # Build the basic training decoder
my_decoder = tf.contrib.seq2seq.BasicDecoder(
cell,
helper,
decoder_initial_state,)
# Dynamic decoding
        # outputs is a BasicDecoderOutput(rnn_output, sample_id) namedtuple
outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
my_decoder,
output_time_major=self.time_major,
swap_memory=True,
scope=decoder_scope)
sample_id = outputs.sample_id
# Note: there's a subtle difference here between train and inference.
# We could have set output_layer when create my_decoder
# and shared more code between train and inference.
# We chose to apply the output_layer to all timesteps for speed:
# 10% improvements for small models & 20% for larger ones.
# If memory is a concern, we should apply output_layer per timestep.
logits = self.output_layer(outputs.rnn_output)
## Inference
else:
beam_width = hparams.beam_width
length_penalty_weight = hparams.length_penalty_weight
        # Fill a vector of length batch_size with the sos id to start each sentence
start_tokens = tf.fill([self.batch_size], tgt_sos_id)
end_token = tgt_eos_id
if beam_width > 0:
my_decoder = tf.contrib.seq2seq.BeamSearchDecoder(
cell=cell,
embedding=self.embedding_decoder,
start_tokens=start_tokens,
end_token=end_token,
initial_state=decoder_initial_state,
beam_width=beam_width,
output_layer=self.output_layer,
length_penalty_weight=length_penalty_weight)
else:
# Helper
sampling_temperature = hparams.sampling_temperature
          # When sampling_temperature is above 1, larger values give more random
          # samples; smaller values approach greedy argmax decoding. The default is 1.0.
if sampling_temperature > 0.0:
helper = tf.contrib.seq2seq.SampleEmbeddingHelper(
self.embedding_decoder, start_tokens, end_token,
softmax_temperature=sampling_temperature,
seed=hparams.random_seed)
else:
helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
self.embedding_decoder, start_tokens, end_token)
# Decoder
my_decoder = tf.contrib.seq2seq.BasicDecoder(
cell,
helper,
decoder_initial_state,
output_layer=self.output_layer # applied per timestep
)
# Dynamic decoding
outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
my_decoder,
maximum_iterations=maximum_iterations,
output_time_major=self.time_major,
swap_memory=True,
scope=decoder_scope)
if beam_width > 0:
#Does nothing
logits = tf.no_op()
          # When the decoder is a BeamSearchDecoder, dynamic_decode returns outputs of type
          # FinalBeamSearchDecoderOutput(predicted_ids, beam_search_decoder_output)
sample_id = outputs.predicted_ids
else:
logits = outputs.rnn_output
sample_id = outputs.sample_id
return logits, sample_id, final_context_state
def get_max_time(self, tensor):
time_axis = 0 if self.time_major else 1
return tensor.shape[time_axis].value or tf.shape(tensor)[time_axis]
  # Subclasses must implement this method
@abc.abstractmethod
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
source_sequence_length):
"""Subclass must implement this.
Args:
hparams: Hyperparameters configurations.
encoder_outputs: The outputs of encoder for every time step.
encoder_state: The final state of the encoder.
source_sequence_length: sequence length of encoder_outputs.
Returns:
A tuple of a multi-layer RNN cell used by decoder
      and the initial state of the decoder RNN.
"""
pass
def _compute_loss(self, logits):
"""Compute optimization loss."""
target_output = self.iterator.target_output
if self.time_major:
target_output = tf.transpose(target_output)
max_time = self.get_max_time(target_output)
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=target_output, logits=logits)
target_weights = tf.sequence_mask(
self.iterator.target_sequence_length, max_time, dtype=logits.dtype)
if self.time_major:
target_weights = tf.transpose(target_weights)
loss = tf.reduce_sum(
crossent * target_weights) / tf.to_float(self.batch_size)
return loss
def _get_infer_summary(self, hparams):
return tf.no_op()
def infer(self, sess):
assert self.mode == tf.contrib.learn.ModeKeys.INFER
return sess.run([
self.infer_logits, self.infer_summary, self.sample_id, self.sample_words
])
def decode(self, sess):
"""Decode a batch.
Args:
sess: tensorflow session to use.
Returns:
      A tuple consisting of outputs, infer_summary.
outputs: of size [batch_size, time]
"""
_, infer_summary, _, sample_words = self.infer(sess)
# make sure outputs is of shape [batch_size, time] or [beam_width,
# batch_size, time] when using beam search.
if self.time_major:
sample_words = sample_words.transpose()
elif sample_words.ndim == 3: # beam search output in [batch_size,
# time, beam_width] shape.
sample_words = sample_words.transpose([2, 0, 1])
return sample_words, infer_summary
class Model(BaseModel):
"""Sequence-to-sequence dynamic model.
This class implements a multi-layer recurrent neural network as encoder,
and a multi-layer recurrent neural network decoder.
"""
def _build_encoder(self, hparams):
"""Build an encoder."""
num_layers = self.num_encoder_layers
num_residual_layers = self.num_encoder_residual_layers
iterator = self.iterator
source = iterator.source
if self.time_major:
source = tf.transpose(source)
with tf.variable_scope("encoder") as scope:
dtype = scope.dtype
# Look up embedding, emp_inp: [max_time, batch_size, num_units]
encoder_emb_inp = tf.nn.embedding_lookup(
self.embedding_encoder, source)
# Encoder_outputs: [max_time, batch_size, num_units]
if hparams.encoder_type == "uni":
utils.print_out(" num_layers = %d, num_residual_layers=%d" %
(num_layers, num_residual_layers))
cell = self._build_encoder_cell(
hparams, num_layers, num_residual_layers)
        # The encoder consumes the embedded source encoder_emb_inp
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
cell,
encoder_emb_inp,
dtype=dtype,
sequence_length=iterator.source_sequence_length,
time_major=self.time_major,
swap_memory=True)
elif hparams.encoder_type == "bi":
num_bi_layers = int(num_layers / 2)
num_bi_residual_layers = int(num_residual_layers / 2)
utils.print_out(" num_bi_layers = %d, num_bi_residual_layers=%d" %
(num_bi_layers, num_bi_residual_layers))
encoder_outputs, bi_encoder_state = (
self._build_bidirectional_rnn(
inputs=encoder_emb_inp,
sequence_length=iterator.source_sequence_length,
dtype=dtype,
hparams=hparams,
num_bi_layers=num_bi_layers,
num_bi_residual_layers=num_bi_residual_layers))
if num_bi_layers == 1:
encoder_state = bi_encoder_state
else:
# alternatively concat forward and backward states
encoder_state = []
for layer_id in range(num_bi_layers):
encoder_state.append(bi_encoder_state[0][layer_id]) # forward
encoder_state.append(bi_encoder_state[1][layer_id]) # backward
encoder_state = tuple(encoder_state)
else:
raise ValueError("Unknown encoder_type %s" % hparams.encoder_type)
return encoder_outputs, encoder_state
def _build_bidirectional_rnn(self, inputs, sequence_length,
dtype, hparams,
num_bi_layers,
num_bi_residual_layers,
base_gpu=0):
"""Create and call biddirectional RNN cells.
Args:
num_residual_layers: Number of residual layers from top to bottom. For
example, if `num_bi_layers=4` and `num_residual_layers=2`, the last 2 RNN
layers in each RNN cell will be wrapped with `ResidualWrapper`.
base_gpu: The gpu device id to use for the first forward RNN layer. The
i-th forward RNN layer will use `(base_gpu + i) % num_gpus` as its
device id. The `base_gpu` for backward RNN cell is `(base_gpu +
num_bi_layers)`.
Returns:
      The concatenated bidirectional output and the bidirectional RNN cell's
state.
"""
# Construct forward and backward cells
fw_cell = self._build_encoder_cell(hparams,
num_bi_layers,
num_bi_residual_layers,
base_gpu=base_gpu)
bw_cell = self._build_encoder_cell(hparams,
num_bi_layers,
num_bi_residual_layers,
base_gpu=(base_gpu + num_bi_layers))
bi_outputs, bi_state = tf.nn.bidirectional_dynamic_rnn(
fw_cell,
bw_cell,
inputs,
dtype=dtype,
sequence_length=sequence_length,
time_major=self.time_major,
swap_memory=True)
return tf.concat(bi_outputs, -1), bi_state
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
source_sequence_length):
"""Build an RNN cell that can be used by decoder."""
# We only make use of encoder_outputs in attention-based models
if hparams.attention:
raise ValueError("BasicModel doesn't support attention.")
cell = create_rnn_cell(
unit_type=hparams.unit_type,
num_units=hparams.num_units,
num_layers=self.num_decoder_layers,
num_residual_layers=self.num_decoder_residual_layers,
forget_bias=hparams.forget_bias,
dropout=hparams.dropout,
num_gpus=self.num_gpus,
mode=self.mode,
single_cell_fn=self.single_cell_fn)
# For beam search, we need to replicate encoder infos beam_width times
if self.mode == tf.contrib.learn.ModeKeys.INFER and hparams.beam_width > 0:
decoder_initial_state = tf.contrib.seq2seq.tile_batch(
encoder_state, multiplier=hparams.beam_width)
else:
decoder_initial_state = encoder_state
return cell, decoder_initial_state
class AttentionModel(Model):
"""Sequence-to-sequence dynamic model with attention.
This class implements a multi-layer recurrent neural network as encoder,
and an attention-based decoder. This is the same as the model described in
(Luong et al., EMNLP'2015) paper: https://arxiv.org/pdf/1508.04025v5.pdf.
This class also allows to use GRU cells in addition to LSTM cells with
support for dropout.
"""
def __init__(self,
hparams,
mode,
iterator,
source_vocab_table,
target_vocab_table,
reverse_target_vocab_table=None,
scope=None,
extra_args=None):
# Set attention_mechanism_fn
if extra_args and extra_args.attention_mechanism_fn:
self.attention_mechanism_fn = extra_args.attention_mechanism_fn
else:
self.attention_mechanism_fn = create_attention_mechanism
super(AttentionModel, self).__init__(
hparams=hparams,
mode=mode,
iterator=iterator,
source_vocab_table=source_vocab_table,
target_vocab_table=target_vocab_table,
reverse_target_vocab_table=reverse_target_vocab_table,
scope=scope,
extra_args=extra_args)
if self.mode == tf.contrib.learn.ModeKeys.INFER:
self.infer_summary = self._get_infer_summary(hparams)
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
source_sequence_length):
"""Build a RNN cell with attention mechanism that can be used by decoder."""
attention_option = hparams.attention
attention_architecture = hparams.attention_architecture
if attention_architecture != "standard":
raise ValueError(
"Unknown attention architecture %s" % attention_architecture)
num_units = hparams.num_units
num_layers = self.num_decoder_layers
num_residual_layers = self.num_decoder_residual_layers
beam_width = hparams.beam_width
dtype = tf.float32
# Ensure memory is batch-major
if self.time_major:
memory = tf.transpose(encoder_outputs, [1, 0, 2])
else:
memory = encoder_outputs
if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
memory = tf.contrib.seq2seq.tile_batch(
memory, multiplier=beam_width)
source_sequence_length = tf.contrib.seq2seq.tile_batch(
source_sequence_length, multiplier=beam_width)
encoder_state = tf.contrib.seq2seq.tile_batch(
encoder_state, multiplier=beam_width)
batch_size = self.batch_size * beam_width
else:
batch_size = self.batch_size
attention_mechanism = self.attention_mechanism_fn(
attention_option, num_units, memory, source_sequence_length, self.mode)
cell = create_rnn_cell(
unit_type=hparams.unit_type,
num_units=num_units,
num_layers=num_layers,
num_residual_layers=num_residual_layers,
forget_bias=hparams.forget_bias,
dropout=hparams.dropout,
num_gpus=self.num_gpus,
mode=self.mode,
single_cell_fn=self.single_cell_fn)
# Only generate alignment in greedy INFER mode.
alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
beam_width == 0)
cell = tf.contrib.seq2seq.AttentionWrapper(
cell,
attention_mechanism,
attention_layer_size=num_units,
alignment_history=alignment_history,
output_attention=hparams.output_attention,
name="attention")
# TODO(thangluong): do we need num_layers, num_gpus?
cell = tf.contrib.rnn.DeviceWrapper(cell,
get_device_str(
num_layers - 1, self.num_gpus))
if hparams.pass_hidden_state:
decoder_initial_state = cell.zero_state(batch_size, dtype).clone(
cell_state=encoder_state)
else:
decoder_initial_state = cell.zero_state(batch_size, dtype)
return cell, decoder_initial_state
def _get_infer_summary(self, hparams):
if hparams.beam_width > 0:
return tf.no_op()
return _create_attention_images_summary(self.final_context_state)
def create_attention_mechanism(attention_option, num_units, memory,
source_sequence_length, mode):
"""Create attention mechanism based on the attention_option."""
del mode # unused
# Mechanism
if attention_option == "luong":
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
num_units, memory, memory_sequence_length=source_sequence_length)
elif attention_option == "scaled_luong":
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
num_units,
memory,
memory_sequence_length=source_sequence_length,
scale=True)
elif attention_option == "bahdanau":
attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
num_units, memory, memory_sequence_length=source_sequence_length)
elif attention_option == "normed_bahdanau":
attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(
num_units,
memory,
memory_sequence_length=source_sequence_length,
normalize=True)
else:
raise ValueError("Unknown attention option %s" % attention_option)
return attention_mechanism
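As a standalone illustration (toy tensors, not taken from the tutorial), a scaled Luong attention mechanism over batch-major encoder outputs would be built like this:

memory = tf.zeros([8, 20, 128])        # [batch_size, src_len, num_units]
source_lengths = tf.fill([8], 20)
attention_mechanism = create_attention_mechanism(
    "scaled_luong", num_units=128, memory=memory,
    source_sequence_length=source_lengths,
    mode=tf.contrib.learn.ModeKeys.TRAIN)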
def _create_attention_images_summary(final_context_state):
"""create attention image and attention summary."""
attention_images = (final_context_state.alignment_history.stack())
# Reshape to (batch, src_seq_len, tgt_seq_len,1)
attention_images = tf.expand_dims(
tf.transpose(attention_images, [1, 2, 0]), -1)
# Scale to range [0, 255]
attention_images *= 255
attention_summary = tf.summary.image("attention_images", attention_images)
return attention_summary
class GNMTModel(AttentionModel):
"""Sequence-to-sequence dynamic model with GNMT attention architecture.
"""
def __init__(self,
hparams,
mode,
iterator,
source_vocab_table,
target_vocab_table,
reverse_target_vocab_table=None,
scope=None,
extra_args=None):
super(GNMTModel, self).__init__(
hparams=hparams,
mode=mode,
iterator=iterator,
source_vocab_table=source_vocab_table,
target_vocab_table=target_vocab_table,
reverse_target_vocab_table=reverse_target_vocab_table,
scope=scope,
extra_args=extra_args)
def _build_encoder(self, hparams):
"""Build a GNMT encoder."""
if hparams.encoder_type == "uni" or hparams.encoder_type == "bi":
return super(GNMTModel, self)._build_encoder(hparams)
if hparams.encoder_type != "gnmt":
raise ValueError("Unknown encoder_type %s" % hparams.encoder_type)
# Build GNMT encoder.
num_bi_layers = 1
num_uni_layers = self.num_encoder_layers - num_bi_layers
utils.print_out(" num_bi_layers = %d" % num_bi_layers)
utils.print_out(" num_uni_layers = %d" % num_uni_layers)
iterator = self.iterator
source = iterator.source
if self.time_major:
source = tf.transpose(source)
with tf.variable_scope("encoder") as scope:
dtype = scope.dtype
# Look up embedding, emp_inp: [max_time, batch_size, num_units]
# when time_major = True
encoder_emb_inp = tf.nn.embedding_lookup(self.embedding_encoder,
source)
# Execute _build_bidirectional_rnn from Model class
bi_encoder_outputs, bi_encoder_state = self._build_bidirectional_rnn(
inputs=encoder_emb_inp,
sequence_length=iterator.source_sequence_length,
dtype=dtype,
hparams=hparams,
num_bi_layers=num_bi_layers,
num_bi_residual_layers=0, # no residual connection
)
uni_cell = create_rnn_cell(
unit_type=hparams.unit_type,
num_units=hparams.num_units,
num_layers=num_uni_layers,
num_residual_layers=self.num_encoder_residual_layers,
forget_bias=hparams.forget_bias,
dropout=hparams.dropout,
num_gpus=self.num_gpus,
base_gpu=1,
mode=self.mode,
single_cell_fn=self.single_cell_fn)
# encoder_outputs: size [max_time, batch_size, num_units]
# when time_major = True
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
uni_cell,
bi_encoder_outputs,
dtype=dtype,
sequence_length=iterator.source_sequence_length,
time_major=self.time_major)
# Pass all encoder state except the first bi-directional layer's state to
# decoder.
encoder_state = (bi_encoder_state[1],) + (
(encoder_state,) if num_uni_layers == 1 else encoder_state)
return encoder_outputs, encoder_state
def _build_decoder_cell(self, hparams, encoder_outputs, encoder_state,
source_sequence_length):
"""Build a RNN cell with GNMT attention architecture."""
# Standard attention
if hparams.attention_architecture == "standard":
return super(GNMTModel, self)._build_decoder_cell(
hparams, encoder_outputs, encoder_state, source_sequence_length)
# GNMT attention
attention_option = hparams.attention
attention_architecture = hparams.attention_architecture
num_units = hparams.num_units
beam_width = hparams.beam_width
dtype = tf.float32
if self.time_major:
memory = tf.transpose(encoder_outputs, [1, 0, 2])
else:
memory = encoder_outputs
if self.mode == tf.contrib.learn.ModeKeys.INFER and beam_width > 0:
memory = tf.contrib.seq2seq.tile_batch(
memory, multiplier=beam_width)
source_sequence_length = tf.contrib.seq2seq.tile_batch(
source_sequence_length, multiplier=beam_width)
encoder_state = tf.contrib.seq2seq.tile_batch(
encoder_state, multiplier=beam_width)
batch_size = self.batch_size * beam_width
else:
batch_size = self.batch_size
attention_mechanism = self.attention_mechanism_fn(
attention_option, num_units, memory, source_sequence_length, self.mode)
cell_list = _cell_list( # pylint: disable=protected-access
unit_type=hparams.unit_type,
num_units=num_units,
num_layers=self.num_decoder_layers,
num_residual_layers=self.num_decoder_residual_layers,
forget_bias=hparams.forget_bias,
dropout=hparams.dropout,
num_gpus=self.num_gpus,
mode=self.mode,
single_cell_fn=self.single_cell_fn,
residual_fn=gnmt_residual_fn
)
# Only wrap the bottom layer with the attention mechanism.
attention_cell = cell_list.pop(0)
# Only generate alignment in greedy INFER mode.
alignment_history = (self.mode == tf.contrib.learn.ModeKeys.INFER and
beam_width == 0)
attention_cell = tf.contrib.seq2seq.AttentionWrapper(
attention_cell,
attention_mechanism,
attention_layer_size=None, # don't use attention layer.
output_attention=False,
alignment_history=alignment_history,
name="attention")
if attention_architecture == "gnmt":
cell = GNMTAttentionMultiCell(
attention_cell, cell_list)
elif attention_architecture == "gnmt_v2":
cell = GNMTAttentionMultiCell(
attention_cell, cell_list, use_new_attention=True)
else:
raise ValueError(
"Unknown attention_architecture %s" % attention_architecture)
if hparams.pass_hidden_state:
decoder_initial_state = tuple(
zs.clone(cell_state=es)
if isinstance(zs, tf.contrib.seq2seq.AttentionWrapperState) else es
for zs, es in zip(
cell.zero_state(batch_size, dtype), encoder_state))
else:
decoder_initial_state = cell.zero_state(batch_size, dtype)
return cell, decoder_initial_state
def _get_infer_summary(self, hparams):
# Standard attention
if hparams.attention_architecture == "standard":
return super(GNMTModel, self)._get_infer_summary(hparams)
# GNMT attention
if hparams.beam_width > 0:
return tf.no_op()
    return _create_attention_images_summary(
        self.final_context_state[0])
class GNMTAttentionMultiCell(tf.contrib.rnn.MultiRNNCell):
"""A MultiCell with GNMT attention style."""
def __init__(self, attention_cell, cells, use_new_attention=False):
"""Creates a GNMTAttentionMultiCell.
Args:
attention_cell: An instance of AttentionWrapper.
cells: A list of RNNCell wrapped with AttentionInputWrapper.
use_new_attention: Whether to use the attention generated from current
step bottom layer's output. Default is False.
"""
cells = [attention_cell] + cells
self.use_new_attention = use_new_attention
super(GNMTAttentionMultiCell, self).__init__(cells, state_is_tuple=True)
def __call__(self, inputs, state, scope=None):
"""Run the cell with bottom layer's attention copied to all upper layers."""
if not nest.is_sequence(state):
raise ValueError(
"Expected state to be a tuple of length %d, but received: %s"
% (len(self.state_size), state))
with tf.variable_scope(scope or "multi_rnn_cell"):
new_states = []
with tf.variable_scope("cell_0_attention"):
attention_cell = self._cells[0]
attention_state = state[0]
cur_inp, new_attention_state = attention_cell(inputs, attention_state)
new_states.append(new_attention_state)
for i in range(1, len(self._cells)):
with tf.variable_scope("cell_%d" % i):
cell = self._cells[i]
cur_state = state[i]
if self.use_new_attention:
cur_inp = tf.concat([cur_inp, new_attention_state.attention], -1)
else:
cur_inp = tf.concat([cur_inp, attention_state.attention], -1)
cur_inp, new_state = cell(cur_inp, cur_state)
new_states.append(new_state)
return cur_inp, tuple(new_states)
def gnmt_residual_fn(inputs, outputs):
"""Residual function that handles different inputs and outputs inner dims.
Args:
inputs: cell inputs, this is actual inputs concatenated with the attention
vector.
outputs: cell outputs
Returns:
outputs + actual inputs
"""
def split_input(inp, out):
out_dim = out.get_shape().as_list()[-1]
inp_dim = inp.get_shape().as_list()[-1]
return tf.split(inp, [out_dim, inp_dim - out_dim], axis=-1)
actual_inputs, _ = nest.map_structure(split_input, inputs, outputs)
def assert_shape_match(inp, out):
inp.get_shape().assert_is_compatible_with(out.get_shape())
nest.assert_same_structure(actual_inputs, outputs)
nest.map_structure(assert_shape_match, actual_inputs, outputs)
return nest.map_structure(lambda inp, out: inp + out, actual_inputs, outputs)
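Finally, a rough end-to-end sketch of how the pieces fit together: build the iterator and vocab tables, instantiate the basic Model (no attention) in TRAIN mode, and run a few update steps. All hyperparameter values below are placeholders chosen just to exercise the code paths above, not the tutorial's tuned settings, and the graph is built fresh to avoid clashing with the variables created in the earlier interactive examples.

# Vocabulary sizes must match the vocab files behind the lookup tables.
src_vocab_size_ = load_vocab(vocab_vi)[1]
tgt_vocab_size_ = load_vocab(vocab_en)[1]

hparams = tf.contrib.training.HParams(
    src_vocab_size=src_vocab_size_, tgt_vocab_size=tgt_vocab_size_,
    src_vocab_file=vocab_vi, tgt_vocab_file=vocab_en,
    src_embed_file="", tgt_embed_file="",          # no pretrained embeddings
    share_vocab=False, num_embeddings_partitions=0,
    num_units=128, num_encoder_layers=2, num_decoder_layers=2,
    num_residual_layers=0, unit_type="lstm", encoder_type="uni",
    forget_bias=1.0, dropout=0.2, time_major=True, num_gpus=1,
    attention="", sos=SOS, eos=EOS, tgt_max_len_infer=0, beam_width=0,
    init_op="uniform", init_weight=0.1, random_seed=123,
    optimizer="sgd", learning_rate=1.0, warmup_steps=100, warmup_scheme="t2t",
    decay_scheme="", num_train_steps=12000, colocate_gradients_with_ops=True,
    max_gradient_norm=5.0, num_keep_ckpts=5)

train_graph = tf.Graph()
with train_graph.as_default():
  src_table, tgt_table = create_vocab_tables(vocab_vi, vocab_en,
                                             hparams.share_vocab)
  train_iterator = get_iterator(
      tf.data.TextLineDataset(train_data_vi),
      tf.data.TextLineDataset(train_data_en),
      src_table, tgt_table, batch_size=32, sos=SOS, eos=EOS,
      random_seed=hparams.random_seed, num_buckets=5)
  train_model = Model(hparams, tf.contrib.learn.ModeKeys.TRAIN,
                      train_iterator, src_table, tgt_table)
  # allow_soft_placement lets the /gpu:0 device hints fall back to CPU.
  config = tf.ConfigProto(allow_soft_placement=True)
  with tf.Session(config=config) as train_sess:
    train_sess.run(tf.global_variables_initializer())
    train_sess.run(tf.tables_initializer())
    train_sess.run(train_iterator.initializer)
    for _ in range(10):
      _, step_loss = train_model.train(train_sess)[:2]
      print("train loss:", step_loss)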