实践 | 文本生成
厉害了,智慧有那么大
·
文本生成
文本生成介绍
序列对序列
注意力机制
1.加载开发环境
import paddle
import paddle.nn.functional as F
import re
import numpy as np

print(paddle.__version__)
# Choose the cpu/gpu runtime by passing the device string to paddle.set_device().
# device = paddle.set_device('gpu')
2. 统计数据集信息,确定句子长度
我们选取能够覆盖约 90% 句子的长度值,作为统一的句子长度。
# Load the en->cn parallel corpus and collect sentence-length histograms,
# which are used to pick a fixed sentence length covering ~90% of the data.
# Fix: the original opened the file without ever closing it; a context
# manager guarantees the handle is released.
with open('data/data78721/cmn.txt', 'r', encoding='utf-8') as f:
    lines = f.readlines()
print(len(lines))

datas = []   # list of [english_token_list, chinese_char_list] pairs
dic_en = {}  # english sentence length (in tokens) -> occurrence count
dic_cn = {}  # chinese sentence length (in chars)  -> occurrence count
for line in lines:
    ll = line.strip().split('\t')
    if len(ll) < 2:
        # malformed line without both languages -- skip it
        continue
    # NOTE(review): [1:-1] drops the first and last English token; this
    # mirrors the original preprocessing -- confirm it is intentional.
    datas.append([ll[0].lower().split(' ')[1:-1], list(ll[1])])
    en_len = len(ll[0].split(' '))
    dic_en[en_len] = dic_en.get(en_len, 0) + 1
    cn_len = len(ll[1])
    dic_cn[cn_len] = dic_cn.get(cn_len, 0) + 1

# Walk each histogram in ascending length to see the cumulative coverage
# (the diagnostic prints are kept commented out, as in the original).
count = 0
# print('English length statistics:')
for k in sorted(dic_en):
    count += dic_en[k]
    # print(k, dic_en[k], count / len(lines))
count = 0
# print('Chinese length statistics:')
for k in sorted(dic_cn):
    count += dic_cn[k]
    # print(k, dic_cn[k], count / len(lines))

# Chosen from the statistics above: length 10 covers ~90% of sentences.
en_length = 10
cn_length = 10
3. 构建中文词表、英文词表
# Build the token -> id vocabularies for both languages.
# Ids 0/1/2 are reserved for the padding / begin / end markers.
en_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2}
cn_vocab = {'<pad>': 0, '<bos>': 1, '<eos>': 2}
en_idx, cn_idx = 3, 3
for en, cn in datas:
    # assign ids in first-seen order, starting after the special tokens
    for token in en:
        if token not in en_vocab:
            en_vocab[token] = en_idx
            en_idx += 1
    for char in cn:
        if char not in cn_vocab:
            cn_vocab[char] = cn_idx
            cn_idx += 1
print(len(list(en_vocab)))
print(len(list(cn_vocab)))
'''
英文词表长度:6057
中文词表长度:3533
'''
4. 创建数据集
接下来根据词表,我们将创建一份以 numpy array 组织、实际用于训练的数据集。
- 所有的句子都通过补充成为了长度相同的句子。
- 对于英文句子(源语言),我们将其反转了过来,这会带来更好的翻译的效果。
- 所创建的padded_cn_label_sents是训练过程中的预测的目标,即,每个中文的当前词去预测下一个词是什么词。
# Turn the token lists into fixed-width id matrices for training.
# - Every sentence is truncated/padded to the same width.
# - Source (English) sentences are reversed, which tends to improve
#   encoder-decoder translation quality.
# - padded_cn_label_sents holds the next-char prediction targets.
padded_en_sents = []
padded_cn_sents = []
padded_cn_label_sents = []
for en, cn in datas:
    en = en[:en_length]
    cn = cn[:cn_length]
    # source: tokens + <eos>, padded to fixed width, then reversed
    en_sent = (en + ['<eos>'] + ['<pad>'] * (en_length - len(en)))[::-1]
    # decoder input: <bos> + chars + <eos> + padding
    cn_sent = ['<bos>'] + cn + ['<eos>'] + ['<pad>'] * (cn_length - len(cn))
    # decoder target: chars shifted left by one step (next-char labels)
    cn_label_sent = cn + ['<eos>'] + ['<pad>'] * (cn_length - len(cn) + 1)
    padded_en_sents.append(np.array([en_vocab[w] for w in en_sent]))
    padded_cn_sents.append(np.array([cn_vocab[w] for w in cn_sent]))
    padded_cn_label_sents.append(np.array([cn_vocab[w] for w in cn_label_sent]))
train_en_sents = np.array(padded_en_sents)
train_cn_sents = np.array(padded_cn_sents)
train_cn_label_sents = np.array(padded_cn_label_sents)
print(train_en_sents.shape)
print(train_cn_sents.shape)
print(train_cn_label_sents.shape)
5.构建基于Transformer的机器翻译模型
首先定义超参数,用于后续模型的设计与训练
# Model / training hyper-parameters.
embedding_size = 128            # embedding width, used as the transformer d_model
hidden_size = 512
num_encoder_lstm_layers = 1     # kept from the original tutorial text; unused below
en_vocab_size = len(en_vocab)   # source-vocabulary size
cn_vocab_size = len(cn_vocab)   # target-vocabulary size
epochs = 20
batch_size = 16
使用TransformerEncoder定义Encoder
# Encoder: learns contextual representations of the source sentence.
class Encoder(paddle.nn.Layer):
    """Transformer encoder over embedded source-language token ids.

    Args:
        en_vocab_size: source-vocabulary size (number of embedding rows).
        embedding_size: embedding width, used as the transformer d_model.
        num_layers: number of stacked encoder layers.
        head_number: attention-head count per layer (nhead).
        middle_units: hidden size of each layer's feed-forward network.
    """

    def __init__(self, en_vocab_size, embedding_size, num_layers=2,
                 head_number=2, middle_units=512):
        super(Encoder, self).__init__()
        self.emb = paddle.nn.Embedding(en_vocab_size, embedding_size,)
        # One layer template; TransformerEncoder stacks `num_layers` copies.
        layer = paddle.nn.TransformerEncoderLayer(
            embedding_size, head_number, middle_units)
        self.encoder = paddle.nn.TransformerEncoder(layer, num_layers)

    def forward(self, x):
        """Embed the id tensor `x` and run it through the encoder stack."""
        embedded = self.emb(x)
        return self.encoder(embedded)
使用TransformerDecoder定义Decoder
class Decoder(paddle.nn.Layer):
    """Transformer decoder mapping target ids + encoder memory to vocab logits.

    Args:
        cn_vocab_size: target-vocabulary size.
        embedding_size: embedding width, used as the transformer d_model.
        num_layers: number of stacked decoder layers.
        head_number: attention-head count per layer (nhead).
        middle_units: hidden size of each layer's feed-forward network.
    """

    def __init__(self, cn_vocab_size, embedding_size, num_layers=2,
                 head_number=2, middle_units=512):
        super(Decoder, self).__init__()
        self.emb = paddle.nn.Embedding(cn_vocab_size, embedding_size)
        layer = paddle.nn.TransformerDecoderLayer(
            embedding_size, head_number, middle_units)
        self.decoder = paddle.nn.TransformerDecoder(layer, num_layers)
        # Projects decoder states onto the target vocabulary.
        self.outlinear = paddle.nn.Linear(embedding_size, cn_vocab_size)

    def forward(self, x, encoder_outputs):
        """Decode target ids `x` while cross-attending over `encoder_outputs`."""
        embedded = self.emb(x)
        # Positional args: (dec_input, enc_output, self_attn_mask, cross_attn_mask);
        # both masks are left as None here.
        states = self.decoder(embedded, encoder_outputs)
        logits = self.outlinear(states)
        # squeeze drops all size-1 dims; callers feed one step at a time,
        # so this collapses (batch, 1, vocab) to (batch, vocab)
        return paddle.squeeze(logits)
训练模型
# Build the encoder/decoder and jointly optimise both parameter sets with Adam.
encoder = Encoder(en_vocab_size, embedding_size)
decoder = Decoder(cn_vocab_size, embedding_size)
opt = paddle.optimizer.Adam(learning_rate=0.0001,
                            parameters=encoder.parameters() + decoder.parameters())
for epoch in range(epochs):
    print("epoch:{}".format(epoch))
    # shuffle training data: one permutation applied to all three arrays
    # so source/target/label rows stay aligned
    perm = np.random.permutation(len(train_en_sents))
    train_en_sents_shuffled = train_en_sents[perm]
    train_cn_sents_shuffled = train_cn_sents[perm]
    train_cn_label_sents_shuffled = train_cn_label_sents[perm]
    # print(train_en_sents_shuffled.shape[0],train_en_sents_shuffled.shape[1])
    for iteration in range(train_en_sents_shuffled.shape[0] // batch_size):
        # one mini-batch of (reversed, padded) source sentences -> encoder memory
        x_data = train_en_sents_shuffled[(batch_size*iteration):(batch_size*(iteration+1))]
        sent = paddle.to_tensor(x_data)
        en_repr = encoder(sent)
        x_cn_data = train_cn_sents_shuffled[(batch_size*iteration):(batch_size*(iteration+1))]
        x_cn_label_data = train_cn_label_sents_shuffled[(batch_size*iteration):(batch_size*(iteration+1))]
        loss = paddle.zeros([1])
        # Teacher forcing: feed the gold char at position i, predict the
        # char at position i+1, across the full padded target width.
        # NOTE(review): each step feeds ONLY the current char, not the whole
        # prefix, so the decoder never sees earlier target context -- confirm
        # this matches the tutorial's intent.
        for i in range(cn_length + 2):
            cn_word = paddle.to_tensor(x_cn_data[:, i:i+1])
            cn_word_label = paddle.to_tensor(x_cn_label_data[:, i])
            logits = decoder(cn_word, en_repr)
            step_loss = F.cross_entropy(logits, cn_word_label)
            loss += step_loss
        # average the summed per-step losses over all decoding positions
        loss = loss / (cn_length + 2)
        if(iteration % 50 == 0):
            print("iter {}, loss:{}".format(iteration, loss.numpy()))
        loss.backward()
        opt.step()
        opt.clear_grad()
6. 使用上述训练好的模型进行测试
# Switch both networks to inference mode and greedily decode a few examples
# drawn from the training set, printing source / reference / prediction.
encoder.eval()
decoder.eval()
num_of_exampels_to_evaluate = 10
# sample distinct sentence indices without replacement
indices = np.random.choice(len(train_en_sents), num_of_exampels_to_evaluate, replace=False)
x_data = train_en_sents[indices]
sent = paddle.to_tensor(x_data)
en_repr = encoder(sent)
# every decoded sequence starts from the <bos> id, shape (batch, 1)
word = np.array(
    [[cn_vocab['<bos>']]] * num_of_exampels_to_evaluate
)
word = paddle.to_tensor(word)
decoded_sent = []
# greedy decoding: at each step feed the previous argmax prediction back in
for i in range(cn_length + 2):
    logits = decoder(word, en_repr)
    word = paddle.argmax(logits, axis=1)
    decoded_sent.append(word.numpy())
    # restore the (batch, 1) shape the decoder expects as input
    word = paddle.unsqueeze(word, axis=-1)
# stack the per-step (batch,) arrays into (batch, steps)
results = np.stack(decoded_sent, axis=1)
for i in range(num_of_exampels_to_evaluate):
    print('---------------------')
    en_input = " ".join(datas[indices[i]][0])
    ground_truth_translate = "".join(datas[indices[i]][1])
    model_translate = ""
    for k in results[i]:
        # map id back to char via the vocab dict's insertion order
        w = list(cn_vocab)[k]
        if w != '<pad>' and w != '<eos>':
            model_translate += w
    print(en_input)
    print("true: {}".format(ground_truth_translate))
    print("pred: {}".format(model_translate))

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐
所有评论(0)