WILDML博客概述了对话系统的现状,其作者基于论文《Improved Deep Learning Baselines for Ubuntu Corpus Dialogs》实现了一个采用Dual Encoder LSTM的基于检索的对话系统。为了加深理解,本文对其中的关键代码进行注释。
对话系统按照不同的维度可分为:
基于检索的和生成模型
长对话和短对话
开放域和封闭域对话
Dual Encoder LSTM模型使用同一个LSTM分别对问题和答案进行编码,得到各自的输出向量,再通过比较向量的相似性选出最优的答案。其问题在于预测时需要把问题与所有候选答案逐一比较,速度显然比较慢;一种折中办法是先把问题归类,只在对应问题类别的候选答案中寻找答案。
作者以随机返回答案和基于tf-idf相似度返回答案两种方法作为基线,发现Dual Encoder LSTM模型相比基线有一定的提高。
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import tensorflow as tf
# Locations of the CSV splits of the dialog corpus.
TRAIN_PATH = 'chatbot-retrieval-master/data/train.csv'
# Each constant points at the split its name announces (the validation and
# test file names used to be swapped).
VALIDATION_PATH = 'chatbot-retrieval-master/data/valid.csv'
TEST_PATH = 'chatbot-retrieval-master/data/test.csv'

# Load every split into its own DataFrame.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
validation_df = pd.read_csv(VALIDATION_PATH)

# One label per test row, all zeros: the candidates handed to the predictors
# start at column 1, so index 0 is the first candidate column.
# NOTE(review): this assumes column 1 holds the ground-truth utterance.
y_test = np.zeros(len(test_df))
训练样本问题是Context,而答案是Utterance,而Label为1表示匹配的答案对,0表示不是匹配的答案对
# Preview the first 20 training rows. As a bare expression the result is only
# shown in an interactive/notebook session.
train_df.head(20)
验证数据和测试数据每个问题有一个正确的答案和9个从其他地方随机选择的答案
# Preview the first two rows of the validation and test frames. As bare
# expressions the results are only shown in an interactive/notebook session.
validation_df.head(2)
test_df.head(2)
Recall@k评测指标:对每个问题,如果正确答案出现在按得分降序排列的前k个候选答案之中,则记为预测正确;Recall@k即预测正确的问题所占的比例。
# Recall@k evaluation function
def evaluate_recall(y, y_test, k=1):
    """
    Return the fraction of examples whose label is among the first k predictions.

    y: per-example predictions, sorted in descending order of score
    y_test: the true value for each example
    k: how many leading predictions of each example are inspected
    """
    total = float(len(y))
    hits = sum(1 for ranked, truth in zip(y, y_test) if truth in ranked[:k])
    return hits / total
通过随机返回答案的方式,测试Recall@k,从结果看Recall@k的结果符合概率的结果
# Pick ten answers at random
def predict_random(context, utterances):
    """Return 10 distinct indices into `utterances` in random order.

    `context` is accepted for interface parity with other predictors but is
    not used.
    """
    candidate_count = len(utterances)
    return np.random.choice(candidate_count, size=10, replace=False)
# DataFrame.iloc selects by integer position; .values returns the selected
# cells as an array. Every column after the first is passed as a candidate.
y_random = [
    predict_random(test_df.Context[row], test_df.iloc[row, 1:].values)
    for row in range(len(test_df))
]
# Report Recall@k of the random baseline for several cut-offs.
for n in (1, 2, 5, 10):
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y_random, y_test, n)))
通过tf-idf模型测试Recall@k的效果,发现比随机的方式有了很大的提高,随机方式Recall@1只有10%的正确率,而tf-idf达到了48%
class TFIDFPredictor:
    """Rank candidate utterances by the dot product of their TF-IDF vectors
    with the TF-IDF vector of the context."""

    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def train(self, data):
        """Fit the vectorizer on the Context and Utterance columns of `data`."""
        self.vectorizer.fit(np.append(data.Context.values, data.Utterance.values))

    def predict(self, context, utterances):
        """Return the indices of `utterances` ordered from highest to lowest score.

        context: a single context string
        utterances: sequence of candidate response strings
        """
        # Convert context and utterances into tfidf vectors
        vector_context = self.vectorizer.transform([context])
        vector_doc = self.vectorizer.transform(utterances)
        # The dot product measures the similarity of the resulting vectors.
        # Use the sparse matrix's own dot(): np.dot is not aware of scipy
        # sparse matrices and only works on them by accident.
        scores = vector_doc.dot(vector_context.T)
        scores = np.asarray(scores.todense()).flatten()
        # Sort by top results and return the indices in descending order
        return np.argsort(scores, axis=0)[::-1]
# Fit the tf-idf baseline on the training frame, then rank the candidates of
# every test row.
pred = TFIDFPredictor()
pred.train(train_df)
y = [
    pred.predict(test_df.Context[row], test_df.iloc[row, 1:].values)
    for row in range(len(test_df))
]
# Report Recall@k of the tf-idf baseline for several cut-offs.
for n in (1, 2, 5, 10):
    print("Recall @ ({}, 10): {:g}".format(n, evaluate_recall(y, y_test, n)))
实现Dual Encoder LSTM模型
import csv
# NOTE(review): this reader is not used by any code visible in this file, and
# the file handle it wraps is never closed explicitly.
f = csv.reader(open('chatbot-retrieval-master/data/train.csv'))

# Preprocessing settings
min_word_frequency = 5    # passed to create_vocab as min_frequency
max_sentence_len = 160    # document length given to the VocabularyProcessor
input_dir = 'chatbot-retrieval-master/data/'
output_dir = input_dir    # generated files are written next to the inputs
1)把数据转化为tfrecords格式
# Tokenizer for English text
def tokenizer_fn(iterator):
    """Lazily split every string of `iterator` on single spaces.

    Returns a generator that yields one list of tokens per input string.
    """
    return (sentence.split(" ") for sentence in iterator)
# Read the rows of a CSV file
def create_csv_iter(filename):
    """
    Returns an iterator over the rows of a CSV file. Skips the header.
    An empty file yields no rows.
    """
    # newline="" lets the csv module do its own newline handling, as the csv
    # documentation requires for files passed to csv.reader.
    with open(filename, newline="") as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header. The default value keeps an empty file from raising
        # StopIteration inside this generator (a RuntimeError under PEP 479).
        next(reader, None)
        yield from reader
# Build the vocabulary
def create_vocab(input_iter, min_frequency):
    """
    Fits a VocabularyProcessor on the documents produced by `input_iter`
    and returns it.

    The processor is configured with the module-level `max_sentence_len`,
    the given `min_frequency` and the module's `tokenizer_fn`.
    """
    processor_cls = tf.contrib.learn.preprocessing.VocabularyProcessor
    processor = processor_cls(
        max_sentence_len,
        min_frequency=min_frequency,
        tokenizer_fn=tokenizer_fn)
    processor.fit(input_iter)
    return processor
# Convert a sequence of words into a sequence of word ids
def transform_sentence(sequence, vocab_processor):
    """
    Maps one sentence to its ids with `vocab_processor.transform` and
    returns the first result converted with `.tolist()`.
    """
    id_arrays = vocab_processor.transform([sequence])
    first_array = next(id_arrays)
    return first_array.tolist()
def create_text_sequence_feature(fl, sentence, sentence_len, vocab):
    """
    Appends one int64 feature per word id of `sentence` to the FeatureList
    protocol buffer `fl` and returns `fl`.

    `sentence_len` is accepted but not used.
    """
    word_ids = transform_sentence(sentence, vocab)
    for current_id in word_ids:
        new_feature = fl.feature.add()
        new_feature.int64_list.value.extend([current_id])
    return fl
# Build the Example for one row of training data
def create_example_train(row, vocab):
    """
    Creates a training example for the Ubuntu Dialog Corpus dataset.
    Returns a tensorflow.Example Protocol Buffer object.

    `row` must unpack into (context, utterance, label). The *_len features
    hold the token counts produced by the vocabulary's tokenizer.
    """
    context, utterance, label = row
    # (feature name, int64 values) pairs, in the order they are written
    int64_features = (
        ("context", transform_sentence(context, vocab)),
        ("utterance", transform_sentence(utterance, vocab)),
        ("context_len", [len(next(vocab._tokenizer([context])))]),
        ("utterance_len", [len(next(vocab._tokenizer([utterance])))]),
        ("label", [int(float(label))]),
    )

    # New Example; a Features message holds several named Feature entries
    example = tf.train.Example()
    for key, values in int64_features:
        example.features.feature[key].int64_list.value.extend(values)
    return example
# Build the Example for one row of the test / validation data
def create_example_test(row, vocab):
    """
    Creates a test/validation example for the Ubuntu Dialog Corpus dataset.
    Returns a tensorflow.Example Protocol Buffer object.

    The first two items of `row` are the context and the utterance; every
    remaining item is stored as a distractor.
    """
    context, utterance = row[:2]
    distractors = row[2:]

    def token_count(text):
        # Number of tokens the vocabulary's tokenizer produces for `text`
        return len(next(vocab._tokenizer([text])))

    # New Example
    example = tf.train.Example()
    feature_map = example.features.feature

    context_len = token_count(context)
    utterance_len = token_count(utterance)
    feature_map["context"].int64_list.value.extend(transform_sentence(context, vocab))
    feature_map["utterance"].int64_list.value.extend(transform_sentence(utterance, vocab))
    feature_map["context_len"].int64_list.value.extend([context_len])
    feature_map["utterance_len"].int64_list.value.extend([utterance_len])

    # Distractor sequences: a length feature and a text feature per distractor
    for position, distractor in enumerate(distractors):
        text_key = "distractor_{}".format(position)
        length_key = "distractor_{}_len".format(position)
        feature_map[length_key].int64_list.value.extend([token_count(distractor)])
        feature_map[text_key].int64_list.value.extend(transform_sentence(distractor, vocab))
    return example
# Write the Examples out as a TFRecords file
def create_tfrecords_file(input_filename, output_filename, example_fn):
    """
    Creates a TFRecords file for the given input data and
    example transformation function.

    input_filename: CSV file read through create_csv_iter
    output_filename: path of the TFRecords file to write
    example_fn: callable mapping one CSV row to an object with a
        SerializeToString() method
    """
    writer = tf.python_io.TFRecordWriter(output_filename)
    print("Creating TFRecords file at {}...".format(output_filename))
    try:
        for row in create_csv_iter(input_filename):
            writer.write(example_fn(row).SerializeToString())
    finally:
        # Close the writer even when a row fails to convert or write, so the
        # file handle is not leaked.
        writer.close()
    print("Wrote to {}".format(output_filename))
def write_vocabulary(vocab_processor, outfile):
    """
    Writes the vocabulary to a file, one word per line.

    Words are written in id order, looked up through the
    `_reverse_mapping` of `vocab_processor.vocabulary_`.
    """
    vocabulary = vocab_processor.vocabulary_
    word_count = len(vocabulary)
    with open(outfile, "w") as vocabfile:
        for word_id in range(word_count):
            vocabfile.write(vocabulary._reverse_mapping[word_id] + "\n")
    print("Saved vocabulary to {}".format(outfile))
import functools
import os
print("Creating vocabulary...")
# Each vocabulary document is the first two CSV columns of a training row,
# joined by a single space.
input_iter = (x[0] + " " + x[1] for x in create_csv_iter(TRAIN_PATH))
vocab = create_vocab(input_iter, min_frequency=min_word_frequency)
print("Total vocabulary size: {}".format(len(vocab.vocabulary_)))

# Create vocabulary.txt file
write_vocabulary(vocab, os.path.join(output_dir, "vocabulary.txt"))

# Save vocab processor
vocab.save(os.path.join(output_dir, "vocab_processor.bin"))

# Create validation.tfrecords, test.tfrecords and train.tfrecords, in that order.
# functools.partial pre-binds arguments: with def add(a, b, c): return a + b + c
# and p = functools.partial(add, 12), p(1, 2) evaluates to 15.
for _source_path, _target_name, _example_builder in (
        (VALIDATION_PATH, "validation.tfrecords", create_example_test),
        (TEST_PATH, "test.tfrecords", create_example_test),
        (TRAIN_PATH, "train.tfrecords", create_example_train)):
    create_tfrecords_file(
        input_filename=_source_path,
        output_filename=os.path.join(output_dir, _target_name),
        example_fn=functools.partial(_example_builder, vocab=vocab))
2)读取tfrecords记录
# Dimension used for every text (word id) feature column
TEXT_FEATURE_SIZE = 160


# Build the feature columns
def get_feature_columns(mode):
    """Return the set of int64 real-valued feature columns for `mode`.

    The context/utterance columns and their length columns are always
    present; TRAIN adds "label", EVAL adds nine distractor columns together
    with their length columns.
    """
    def int64_column(name, dimension):
        return tf.contrib.layers.real_valued_column(
            column_name=name, dimension=dimension, dtype=tf.int64)

    specs = [
        ("context", TEXT_FEATURE_SIZE),
        ("context_len", 1),
        ("utterance", TEXT_FEATURE_SIZE),
        ("utterance_len", 1),
    ]

    if mode == tf.contrib.learn.ModeKeys.TRAIN:
        # During training we have a label feature
        specs.append(("label", 1))

    if mode == tf.contrib.learn.ModeKeys.EVAL:
        # During evaluation we have distractors
        for i in range(9):
            specs.append(("distractor_{}".format(i), TEXT_FEATURE_SIZE))
            specs.append(("distractor_{}_len".format(i), 1))

    return {int64_column(name, dimension) for name, dimension in specs}
def create_input_fn(mode, input_files, batch_size, num_epochs):
    """Return an `input_fn` that reads batches of features from `input_files`.

    The returned function yields `(feature_map, target)`: in TRAIN mode the
    target is the "label" feature removed from the map, otherwise it is a
    `[batch_size, 1]` tensor of int64 zeros.
    """
    training = mode == tf.contrib.learn.ModeKeys.TRAIN

    def input_fn():
        # Derive the parsing spec from the feature columns
        parsing_spec = tf.contrib.layers.create_feature_spec_for_parsing(
            get_feature_columns(mode))

        # Dictionary of batched Tensor / SparseTensor values keyed by feature name
        feature_map = tf.contrib.learn.io.read_batch_features(
            file_pattern=input_files,
            batch_size=batch_size,
            features=parsing_spec,
            reader=tf.TFRecordReader,
            randomize_input=True,
            num_epochs=num_epochs,
            queue_capacity=200000 + batch_size * 10,
            name="read_batch_features_{}".format(mode))

        if not training:
            # In evaluation we have 10 classes (utterances).
            # The first one (index 0) is always the correct one
            return feature_map, tf.zeros([batch_size, 1], dtype=tf.int64)

        # This is an ugly hack because of a current bug in tf.learn
        # During evaluation TF tries to restore the epoch variable which isn't
        # defined during training, so we define the variable manually here
        tf.get_variable(
            "read_batch_features_eval/file_name_queue/limit_epochs/epochs",
            initializer=tf.constant(0, dtype=tf.int64))
        # dict.pop returns the value stored under the key and removes the key
        target = feature_map.pop("label")
        return feature_map, target

    return input_fn
from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
def create_evaluation_metrics():
    """Return a dict of MetricSpec objects keyed "recall_at_<k>" for k in 1, 2, 5, 10.

    Each spec wraps `tf.contrib.metrics.streaming_sparse_recall_at_k` with
    `k` pre-bound.
    """
    recall_fn = tf.contrib.metrics.streaming_sparse_recall_at_k
    return {
        "recall_at_%d" % k: MetricSpec(metric_fn=functools.partial(recall_fn, k=k))
        for k in [1, 2, 5, 10]
    }
3)创建模型
from collections import defaultdict
import array
import sys
# Model hyper-parameters
vocab_size = 1000        # rows of the randomly initialised embedding matrix
embedding_dim = 100      # columns of the embedding matrix
glove_path = ''          # empty: get_embeddings falls back to random embeddings
vocab_path = ''          # empty: get_embeddings falls back to random embeddings
rnn_dim = 10             # number of units of the LSTM cell
learning_rate = 0.01     # passed to tf.contrib.layers.optimize_loss
# NOTE(review): an optimizer *instance* is created here with its own default
# settings; confirm whether optimize_loss applies `learning_rate` to it.
optimizer = tf.train.AdadeltaOptimizer()
# Load the vocabulary
def load_vocab(filename):
    """Read a vocabulary file that holds one word per line.

    Returns `[words, word_to_id]`: the list of lines and a
    `defaultdict(int)` mapping each word to its line index (looking up an
    unknown word yields 0).
    """
    with open(filename) as f:
        words = f.read().splitlines()
    word_to_id = defaultdict(int)
    word_to_id.update((word, idx) for idx, word in enumerate(words))
    return [words, word_to_id]
# Load the word embedding vectors
def load_glove_vectors(filename, vocab):
    """
    Load glove vectors from a .txt file.
    Optionally limit the vocabulary to save memory. `vocab` should be a set;
    when it is empty or None every vector of the file is loaded.

    Returns `[matrix, dct]`: a (num_vectors, word_dim) array and a dict
    mapping each loaded word to its row in that array.
    """
    dct = {}
    vectors = array.array('d')
    current_idx = 0
    # Stays 0 when nothing is loaded, so the final reshape still works
    word_dim = 0
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            # Strip the line ending so the last number carries no whitespace
            tokens = line.rstrip().split(" ")
            word = tokens[0]
            entries = tokens[1:]
            if not entries:
                # Blank line or a word without a vector
                continue
            if word in dct:
                # Keep the first vector of a repeated word so the row count
                # stays equal to len(dct)
                continue
            if not vocab or word in vocab:
                dct[word] = current_idx
                vectors.extend(float(x) for x in entries)
                current_idx += 1
                # Taken from a stored vector rather than the last line read
                word_dim = len(entries)
    num_vectors = len(dct)
    # `vocab` is optional, so len(vocab) cannot be called unconditionally
    vocab_total = len(vocab) if vocab else num_vectors
    tf.logging.info("Found {} out of {} vectors in Glove".format(num_vectors, vocab_total))
    return [np.array(vectors).reshape(num_vectors, word_dim), dct]
# Build the initial embedding matrix from the loaded word vectors
def build_initial_embedding_matrix(vocab_dict, glove_dict, glove_vectors, embedding_dim):
    """Return a float32 (len(vocab_dict), embedding_dim) embedding matrix.

    Every row starts as uniform noise in [-0.25, 0.25); rows of words that
    are present in `glove_dict` are overwritten with their glove vector.
    Glove words that are missing from `vocab_dict` are ignored.

    vocab_dict: mapping word -> row index in the returned matrix
    glove_dict: mapping word -> row index in `glove_vectors`
    glove_vectors: 2-D array holding one glove vector per row
    """
    initial_embeddings = np.random.uniform(
        -0.25, 0.25, (len(vocab_dict), embedding_dim)).astype("float32")
    for word, glove_word_idx in glove_dict.items():
        word_idx = vocab_dict.get(word)
        if word_idx is None:
            # Indexing with None would add an axis and broadcast this vector
            # over every row of the matrix, so unknown words are skipped.
            continue
        initial_embeddings[word_idx, :] = glove_vectors[glove_word_idx]
    return initial_embeddings
# Create the word embedding variable
def get_embeddings(glove_path=None, vocab_path=None):
    """Return the "word_embeddings" variable.

    When both paths are given, the variable is initialised from the matrix
    built out of the vocabulary and glove files, and takes its shape from
    that matrix. Otherwise it is a [vocab_size, embedding_dim] variable
    initialised uniformly in [-0.25, 0.25).
    """
    if glove_path and vocab_path:
        tf.logging.info("Loading Glove embeddings...")
        vocab_array, vocab_dict = load_vocab(vocab_path)
        glove_vectors, glove_dict = load_glove_vectors(glove_path, vocab=set(vocab_array))
        initial_matrix = build_initial_embedding_matrix(
            vocab_dict, glove_dict, glove_vectors, embedding_dim)
        # A constant initializer already carries its shape; tf.get_variable
        # raises a ValueError when `shape` is passed alongside it.
        return tf.get_variable("word_embeddings", initializer=initial_matrix)

    tf.logging.info("No glove/vocab path specificed, starting with random embeddings.")
    return tf.get_variable(
        "word_embeddings",
        shape=[vocab_size, embedding_dim],
        initializer=tf.random_uniform_initializer(-0.25, 0.25))
# Define the model
def dual_encoder_model(
        mode,
        context,
        context_len,
        utterance,
        utterance_len,
        targets):
    """Build the dual encoder graph and return `(probs, mean_loss)`.

    Context and utterance are embedded, encoded with one shared LSTM, and
    scored as sigmoid(c^T M r). In INFER mode the loss is None and `targets`
    is not used.

    mode: one of tf.contrib.learn.ModeKeys
    context, utterance: word id tensors
    context_len, utterance_len: sequence lengths passed to dynamic_rnn
    targets: labels for the cross-entropy loss
    """
    # Initialize embeddings randomly or with pre-trained vectors if available
    embeddings_W = get_embeddings(glove_path, vocab_path)

    # Embed the context and the utterance
    context_embedded = tf.nn.embedding_lookup(
        embeddings_W, context, name="embed_context")
    utterance_embedded = tf.nn.embedding_lookup(
        embeddings_W, utterance, name="embed_utterance")

    # Build the RNN
    with tf.variable_scope("rnn"):
        # One LSTM cell is shared by the context and the utterance
        cell = tf.nn.rnn_cell.LSTMCell(
            rnn_dim,
            forget_bias=2.0,
            use_peepholes=True,
            state_is_tuple=True)

        # Run the utterance and context through the RNN as one stacked batch.
        # tf.concat / tf.split take (values, axis) / (value, num, axis) in the
        # API generation that dropped tf.batch_matmul.
        rnn_outputs, rnn_states = tf.nn.dynamic_rnn(
            cell,
            tf.concat([context_embedded, utterance_embedded], axis=0),
            sequence_length=tf.concat([context_len, utterance_len], axis=0),
            dtype=tf.float32)
        encoding_context, encoding_utterance = tf.split(rnn_states.h, 2, axis=0)

    with tf.variable_scope("prediction"):
        M = tf.get_variable("M",
                            shape=[rnn_dim, rnn_dim],
                            initializer=tf.truncated_normal_initializer())

        # "Predict" a response: c * M
        generated_response = tf.matmul(encoding_context, M)
        # Add a third axis: [batch, rnn_dim] -> [batch, rnn_dim, 1]
        generated_response = tf.expand_dims(generated_response, 2)
        encoding_utterance = tf.expand_dims(encoding_utterance, 2)

        # Dot product between generated response and actual response
        # (c * M) * r
        # tf.matmul multiplies the two innermost dimensions as matrices. The
        # first operand must be transposed to [batch, 1, rnn_dim], which is
        # what the adj_x=True flag of the former tf.batch_matmul call did.
        logits = tf.matmul(generated_response, encoding_utterance, transpose_a=True)
        # Drop axis 2 (size 1): [batch, 1, 1] -> [batch, 1]
        logits = tf.squeeze(logits, [2])

        # Apply sigmoid to convert logits to probabilities
        probs = tf.sigmoid(logits)

        if mode == tf.contrib.learn.ModeKeys.INFER:
            return probs, None

        # Calculate the binary cross-entropy loss (keyword arguments are
        # required by this function)
        losses = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=tf.to_float(targets))

    # Mean loss across the batch of examples
    mean_loss = tf.reduce_mean(losses, name="mean_loss")
    return probs, mean_loss
def get_id_feature(features, key, len_key, max_len):
    """Return `(ids, ids_len)` for one text feature.

    `ids` is `features[key]` unchanged; `ids_len` is `features[len_key]`
    with axis 1 squeezed away and every value capped at `max_len`.
    """
    length_cap = tf.constant(max_len, dtype=tf.int64)
    raw_lengths = tf.squeeze(features[len_key], [1])
    return features[key], tf.minimum(raw_lengths, length_cap)
# Create the optimisation op
def create_train_op(loss):
    """Return the training op built by `tf.contrib.layers.optimize_loss`.

    Uses the module-level `learning_rate` and `optimizer`, the global step,
    and a gradient clipping value of 10.0.
    """
    step = tf.contrib.framework.get_global_step()
    return tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=step,
        learning_rate=learning_rate,
        clip_gradients=10.0,
        optimizer=optimizer)
def create_model_fn(model_impl, max_context_len=160, max_utterance_len=80 * 2):
    """Wrap `model_impl` into a `model_fn(features, targets, mode)`.

    model_impl: callable with the signature of `dual_encoder_model`
    max_context_len, max_utterance_len: caps applied to the stored sequence
        lengths. Both default to 160, the sentence length the records in this
        file are built with, so the default adds no truncation of its own.

    The returned model_fn yields `(predictions, loss, train_op)`:
    TRAIN trains on the labelled pairs, INFER returns probabilities only, and
    EVAL scores the true utterance together with the nine distractors and
    returns one row of ten probabilities per example.
    """
    def model_fn(features, targets, mode):
        context, context_len = get_id_feature(
            features, "context", "context_len", max_context_len)

        utterance, utterance_len = get_id_feature(
            features, "utterance", "utterance_len", max_utterance_len)

        if mode == tf.contrib.learn.ModeKeys.TRAIN:
            probs, loss = model_impl(
                mode,
                context,
                context_len,
                utterance,
                utterance_len,
                targets)
            train_op = create_train_op(loss)
            return probs, loss, train_op

        if mode == tf.contrib.learn.ModeKeys.INFER:
            probs, loss = model_impl(
                mode,
                context,
                context_len,
                utterance,
                utterance_len,
                None)
            return probs, 0.0, None

        if mode == tf.contrib.learn.ModeKeys.EVAL:
            # Only evaluation needs the batch size; reading it here keeps
            # INFER from touching `targets`, which may be None there.
            batch_size = targets.get_shape().as_list()[0]

            # We have 10 examples per record, so we accumulate them
            all_contexts = [context]
            all_context_lens = [context_len]
            all_utterances = [utterance]
            all_utterance_lens = [utterance_len]
            all_targets = [tf.ones([batch_size, 1], dtype=tf.int64)]

            for i in range(9):
                distractor, distractor_len = get_id_feature(
                    features,
                    "distractor_{}".format(i),
                    "distractor_{}_len".format(i),
                    max_utterance_len)
                all_contexts.append(context)
                all_context_lens.append(context_len)
                all_utterances.append(distractor)
                all_utterance_lens.append(distractor_len)
                all_targets.append(tf.zeros([batch_size, 1], dtype=tf.int64))

            # tf.concat takes (values, axis) and tf.split (value, num, axis)
            # in the API generation that also provides tf.summary.*
            probs, loss = model_impl(
                mode,
                tf.concat(all_contexts, axis=0),
                tf.concat(all_context_lens, axis=0),
                tf.concat(all_utterances, axis=0),
                tf.concat(all_utterance_lens, axis=0),
                tf.concat(all_targets, axis=0))

            # Regroup the stacked scores: one column per candidate
            split_probs = tf.split(probs, 10, axis=0)
            shaped_probs = tf.concat(split_probs, axis=1)

            # Add summaries
            tf.summary.histogram("eval_correct_probs_hist", split_probs[0])
            tf.summary.scalar("eval_correct_probs_average", tf.reduce_mean(split_probs[0]))
            tf.summary.histogram("eval_incorrect_probs_hist", split_probs[1])
            tf.summary.scalar("eval_incorrect_probs_average", tf.reduce_mean(split_probs[1]))

            return shaped_probs, loss, None

    return model_fn
import time
# Run configuration
MODEL_DIR = './runs'     # model_dir handed to the Estimator
loglevel = 20            # verbosity passed to tf.logging.set_verbosity
num_epochs = 100         # epochs for the training input function
eval_every = 2000        # every_n_steps of the validation monitor
batch_size = 1000        # training batch size
eval_batch_size = 1000   # evaluation batch size

TIMESTAMP = int(time.time())

# Absolute paths of the TFRecords files inside input_dir
TRAIN_FILE = os.path.abspath(os.path.join(input_dir, "train.tfrecords"))
VALIDATION_FILE = os.path.abspath(os.path.join(input_dir, "validation.tfrecords"))

tf.logging.set_verbosity(loglevel)
def main(unused_argv):
    """Train the dual encoder with a validation monitor attached.

    Builds the Estimator around `dual_encoder_model`, creates the training
    and evaluation input functions, and calls `fit` with `steps=None`.
    `unused_argv` is ignored.
    """
    run_config = tf.contrib.learn.RunConfig()
    estimator = tf.contrib.learn.Estimator(
        model_fn=create_model_fn(model_impl=dual_encoder_model),
        model_dir=MODEL_DIR,
        config=run_config)

    # Training batches come from TRAIN_FILE
    input_fn_train = create_input_fn(
        mode=tf.contrib.learn.ModeKeys.TRAIN,
        input_files=[TRAIN_FILE],
        batch_size=batch_size,
        num_epochs=num_epochs)

    # Evaluation reads VALIDATION_FILE for a single epoch
    input_fn_eval = create_input_fn(
        mode=tf.contrib.learn.ModeKeys.EVAL,
        input_files=[VALIDATION_FILE],
        batch_size=eval_batch_size,
        num_epochs=1)

    # Run the recall metrics every `eval_every` steps
    eval_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=input_fn_eval,
        every_n_steps=eval_every,
        metrics=create_evaluation_metrics())

    estimator.fit(input_fn=input_fn_train, steps=None, monitors=[eval_monitor])