# 解析lstm（官方文件）原始碼

http://colah.github.io/posts/2015-08-Understanding-LSTMs/ 一篇很容易理解的lstm的文件。

``````if __name__ == '__main__':
    # See function train for all possible parameter and there definition.
    train_lstm(
        max_epochs=100,
        test_size=500,
    )``````

``````model_options = locals().copy()
print("model options", model_options)``````

``````train, valid, test = load_data(n_words=n_words, valid_portion=0.05,
                               maxlen=maxlen)``````

``````def load_data(path="imdb.pkl", n_words=100000, valid_portion=0.1, maxlen=None,
sort_by_len=True):
path = get_dataset_file(
path, "imdb.pkl",
"http://www.iro.umontreal.ca/~lisa/deep/data/imdb.pkl")
if path.endswith(".gz"):
f = gzip.open(path, 'rb')
else:
f = open(path, 'rb')
f.close()
if maxlen:
new_train_set_x = []
new_train_set_y = []
for x, y in zip(train_set[0], train_set[1]):
if len(x) < maxlen:
new_train_set_x.append(x)
new_train_set_y.append(y)
train_set = (new_train_set_x, new_train_set_y)
del new_train_set_x, new_train_set_y
# split training set into validation set
train_set_x, train_set_y = train_set
n_samples = len(train_set_x)
sidx = numpy.random.permutation(n_samples)
n_train = int(numpy.round(n_samples * (1. - valid_portion)))
valid_set_x = [train_set_x[s] for s in sidx[n_train:]]
valid_set_y = [train_set_y[s] for s in sidx[n_train:]]
train_set_x = [train_set_x[s] for s in sidx[:n_train]]
train_set_y = [train_set_y[s] for s in sidx[:n_train]]
def len_argsort(seq):
return sorted(range(len(seq)), key=lambda x: len(seq[x]))
if sort_by_len:
sorted_index = len_argsort(test_set_x)
test_set_x = [test_set_x[i] for i in sorted_index]
test_set_y = [test_set_y[i] for i in sorted_index]
sorted_index = len_argsort(valid_set_x)
valid_set_x = [valid_set_x[i] for i in sorted_index]
valid_set_y = [valid_set_y[i] for i in sorted_index]
sorted_index = len_argsort(train_set_x)
train_set_x = [train_set_x[i] for i in sorted_index]
train_set_y = [train_set_y[i] for i in sorted_index]
train = (train_set_x, train_set_y)
valid = (valid_set_x, valid_set_y)
test = (test_set_x, test_set_y)
return train, valid, test
train_set = (train_set_x, train_set_y)
valid_set = (valid_set_x, valid_set_y)``````

remove_unk(x)把詞num超過定值的置1。sort_by_len:將句子按照長度大小重新排序。

``````(use_noise, x, mask,
y, f_pred_prob, f_pred, cost) = build_model(tparams, model_options)``````

``````emb = tparams['Wemb'][x.flatten()].reshape([n_timesteps,
                                            n_samples,
                                            options['dim_proj']])
proj = get_layer(options['encoder'])[1](tparams, emb, options,
                                        prefix=options['encoder'],
                                        mask=mask)
if options['encoder'] == 'lstm':
    proj = (proj * mask[:, :, None]).sum(axis=0)
    proj = proj / mask.sum(axis=0)[:, None]
if options['use_dropout']:
    proj = dropout_layer(proj, use_noise, trng)

pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

off = 1e-8
if pred.dtype == 'float16':
    off = 1e-6

cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean()

return use_noise, x, mask, y, f_pred_prob, f_pred, cost``````

1、首先tparams[‘Wemb’]裡面存的是詞向量（每個詞一個dim_proj，預設呼叫了），通過[x.flatten()].reshape延展成了一個3d矩陣（maxlen最長句子的單詞數，bath_size 每次掃描的單詞數，dim_proj詞向量／lstm hidden layer的個數）
ps.x的值來自於imdb.py裡的prepare_data返回值

2、proj = get_layer(options[‘encoder’])[1]，這裡呼叫的是
def lstm_layer(tparams, state_below, options, prefix=’lstm’, mask=None): 這個函式比較複雜，填一段程式碼解析：

``````def lstm_layer(tparams, state_below, options, prefix='lstm', mask=None):
nsteps = state_below.shape[0]
if state_below.ndim == 3:
n_samples = state_below.shape[1]
else:
n_samples = 1
assert mask is not None
def _slice(_x, n, dim):
if _x.ndim == 3:
return _x[:, :, n * dim:(n   1) * dim]
return _x[:, n * dim:(n   1) * dim]
def _step(m_, x_, h_, c_):
preact = tensor.dot(h_, tparams[_p(prefix, 'U')])
preact  = x_
i = tensor.nnet.sigmoid(_slice(preact, 0, options['dim_proj']))
f = tensor.nnet.sigmoid(_slice(preact, 1, options['dim_proj']))
o = tensor.nnet.sigmoid(_slice(preact, 2, options['dim_proj']))
c = tensor.tanh(_slice(preact, 3, options['dim_proj']))
c = f * c_   i * c
c = m_[:, None] * c   (1. - m_)[:, None] * c_
h = o * tensor.tanh(c)
h = m_[:, None] * h   (1. - m_)[:, None] * h_
return h, c
state_below = (tensor.dot(state_below, tparams[_p(prefix, 'W')])
tparams[_p(prefix, 'b')])
dim_proj = options['dim_proj']
rval, updates = theano.scan(_step,
outputs_info=[tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj),
tensor.alloc(numpy_floatX(0.),
n_samples,
dim_proj)],
name=_p(prefix, '_layers'),
n_steps=nsteps)
return rval[0]``````

2(2)、每一步step裡計算的就是根據lstm的結構定義計算input gate, forget gate, output gate, h, c, 需要注意的是，在def param_init_lstm(options, params, prefix=’lstm’):中w,u,b引數都被定義4*dim_proj 列 這是因為i,~c,f和o這4個引數可以平行計算（根據模型的公式）

c = m_[:, None] * c + (1. - m_)[:, None] * c_
h = m_[:, None] * h + (1. - m_)[:, None] * h_

2(4)、nsteps完成後，返回rval[0]，這是由每一步的h堆疊而成的3維矩陣

``````    if options['encoder'] == 'lstm':
        proj = (proj * mask[:, :, None]).sum(axis=0)
        proj = proj / mask.sum(axis=0)[:, None]
    if options['use_dropout']:
        proj = dropout_layer(proj, use_noise, trng)

    pred = tensor.nnet.softmax(tensor.dot(proj, tparams['U']) + tparams['b'])

    f_pred_prob = theano.function([x, mask], pred, name='f_pred_prob')
    f_pred = theano.function([x, mask], pred.argmax(axis=1), name='f_pred')

    off = 1e-8
    if pred.dtype == 'float16':
        off = 1e-6

    cost = -tensor.log(pred[tensor.arange(n_samples), y] + off).mean()

    return use_noise, x, mask, y, f_pred_prob, f_pred, cost``````

nsteps的三維h經過降維meanpooling後與u,b運算，注意這裡的u,b同lstm裡面的u,b不同，返回一個f_pred函式作為預測函式

``````f_cost = theano.function([x, mask, y], cost, name='f_cost')

grads = tensor.grad(cost, wrt=list(tparams.values()))
f_grad = theano.function([x, mask, y], grads, name='f_grad')

lr = tensor.scalar(name='lr')
f_grad_shared, f_update = optimizer(lr, tparams, grads,
                                    x, mask, y, cost)``````

``````kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)``````

get_minibatches_idx用於將送入的n個句子按照minibatch進行劃分

``````        for eidx in range(max_epochs):
            n_samples = 0

            # Get new shuffled index for the training set.
            kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
``````

`` for _, train_index in kf:``

`````` if numpy.mod(uidx, dispFreq) == 0:
if saveto and numpy.mod(uidx, saveFreq) == 0:
``````

``````                if numpy.mod(uidx, validFreq) == 0:
                    use_noise.set_value(0.)
                    train_err = pred_error(f_pred, prepare_data, train, kf)
                    valid_err = pred_error(f_pred, prepare_data, valid,
                                           kf_valid)
                    test_err = pred_error(f_pred, prepare_data, test, kf_test)

                    history_errs.append([valid_err, test_err])

                    if (best_p is None or
                            valid_err <= numpy.array(history_errs)[:,
                                                                   0].min()):
                        best_p = unzip(tparams)

                    print( ('Train ', train_err, 'Valid ', valid_err,
                           'Test ', test_err) )

                    if (len(history_errs) > patience and
                            valid_err >= numpy.array(history_errs)[:-patience,
                                                                   0].min()):