Fasttext、TextCNN和BERT三种算法对AG_news数据的文本分类实现比较

Fasttext、TextCNN和BERT三种算法对AG_news数据的文本分类实现比较
在自然语言处理领域中,文本分类是一项基础而重要的任务。为了验证不同算法在文本分类任务中的表现,我们使用了AG_news数据集,并分别使用Fasttext、TextCNN和BERT三种算法进行实现。本文将分别介绍三种算法的实现过程,并对比它们在AG_news数据集上的表现。
数据集介绍
AG_news数据集的训练集包含120000条新闻文本,测试集包含7600条新闻文本,分为四个类别:World、Sports、Business和Sci/Tech。其中每个类别在训练集中包含30000条新闻文本,每条新闻文本包含标题和正文。我们使用了其中的标题作为分类的文本数据。
Fasttext实现
Fasttext是一种基于词袋和n-gram特征的文本分类算法,它的核心思想是将文本表示成一个词袋模型,并使用n-gram构建特征表示。Fasttext还支持层级Softmax和负采样等技术,可以高效地处理大规模文本数据。需要注意的是,下面的代码通过loss='ova'选用的是one-vs-all损失,而不是层级Softmax或负采样。
在AG_news数据集上,我们使用Fasttext进行文本分类,代码如下:
import fasttext
# Locations of the training file, the test file and the saved model.
train_data = 'ag_news.train'
test_data = 'ag_news.test'
model_path = 'fasttext_model.bin'

# Hyperparameters forwarded to the supervised trainer.
training_options = dict(
    epoch=10,
    lr=1.0,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss='ova',
)

# Train the classifier on the training file.
model = fasttext.train_supervised(input=train_data, **training_options)

# Persist the trained model to disk.
model.save_model(model_path)

# Evaluate on the test file and print whatever model.test() reports.
result = model.test(test_data)
print(result)
TextCNN实现
TextCNN是一种卷积神经网络模型,它的核心思想是使用卷积层对文本进行特征提取,并使用池化层对特征进行下采样,最后使用全连接层进行分类。TextCNN的优点是可以在不同长度的文本上进行分类,并且可以自动学习文本的局部特征。
在AG_news数据集上,我们使用TextCNN进行文本分类,代码如下:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
# Model definition
class TextCNN(nn.Module):
    """Convolutional text classifier.

    Pipeline: token embedding -> one convolution per filter size ->
    ReLU -> max pooling over the sequence axis -> dropout -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_filters: number of output channels of every convolution.
        filter_sizes: convolution heights in tokens; one convolution is
            created per entry (any positive number of entries is accepted).
        output_dim: number of logits produced per example.
        dropout: dropout probability applied to the pooled features.

    Raises:
        ValueError: if ``filter_sizes`` is empty.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        filter_sizes = tuple(filter_sizes)
        if not filter_sizes:
            raise ValueError("filter_sizes must contain at least one size")
        self.filter_sizes = filter_sizes
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Register the convolutions as conv_0, conv_1, ... so that the
        # state_dict keys stay the same as with hand-written attributes.
        for index, size in enumerate(filter_sizes):
            self.add_module(
                f"conv_{index}",
                nn.Conv2d(
                    in_channels=1,
                    out_channels=n_filters,
                    kernel_size=(size, embedding_dim),
                ),
            )
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return logits of shape (batch, output_dim).

        ``text`` holds integer token ids laid out as (batch, seq_len): the
        channel axis is inserted at position 1 and the convolution kernels
        span (tokens, embedding_dim).
        """
        # (batch, seq_len) -> (batch, 1, seq_len, embedding_dim)
        embedded = self.embedding(text).unsqueeze(1)

        # A convolution cannot be applied when the sequence is shorter than
        # its kernel, so zero-pad the sequence axis up to the largest filter.
        shortfall = max(self.filter_sizes) - embedded.shape[2]
        if shortfall > 0:
            embedded = nn.functional.pad(embedded, (0, 0, 0, shortfall))

        pooled = []
        for index in range(len(self.filter_sizes)):
            conv = getattr(self, f"conv_{index}")
            # (batch, n_filters, positions, 1) -> (batch, n_filters, positions)
            conved = nn.functional.relu(conv(embedded).squeeze(3))
            # Max over all positions -> (batch, n_filters)
            pooled.append(nn.functional.max_pool1d(conved, conved.shape[2]).squeeze(2))

        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)
# torchtext is used below but is not among the imports of this snippet.
import torchtext

# Hyperparameters
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 4
DROPOUT = 0.5
MAX_VOCAB_SIZE = 10000
BATCH_SIZE = 64

# Preprocessing
# batch_first=True yields (batch, seq_len) batches, which is the layout the
# TextCNN convolutions need (channel axis inserted at position 1).
TEXT = torchtext.legacy.data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
    batch_first=True,
)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

# Load the data.
# NOTE(review): confirm that the installed torchtext version exposes
# AG_NEWS.splits(...) under torchtext.legacy.datasets.
train_data, test_data = torchtext.legacy.datasets.AG_NEWS.splits(TEXT, LABEL, root='data')
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# Size the embedding table from the vocabulary that was actually built:
# special tokens are added on top of max_size, so a hard-coded 10000 would
# leave the highest token ids outside the embedding table.
INPUT_DIM = len(TEXT.vocab)

# Iterators
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
    (train_data, test_data), batch_size=BATCH_SIZE, device=device)

# Model, optimizer and loss
model = TextCNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)
# Train the model for one epoch
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: module called as ``model(text)`` to produce logits.
        iterator: iterable of batches exposing ``batch.text`` (a
            ``(tokens, lengths)`` pair) and ``batch.label``.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, labels)``;
            assumed to return the mean over the batch — TODO confirm when a
            different reduction is used.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over examples, so a smaller
        final batch does not get the same weight as a full one. Returns
        ``(0.0, 0.0)`` when the iterator yields no examples.
    """
    total_loss = 0.0
    total_acc = 0.0
    total_examples = 0
    model.train()
    for batch in iterator:
        optimizer.zero_grad()
        text, _ = batch.text  # the lengths are not used by the model
        # squeeze(1) only has an effect when the model emits one logit per example.
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()
        batch_size = len(batch.label)
        total_loss += loss.item() * batch_size
        total_acc += acc.item() * batch_size
        total_examples += batch_size
    if total_examples == 0:
        return 0.0, 0.0
    return total_loss / total_examples, total_acc / total_examples
# Evaluate the model
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    Args:
        model: module called as ``model(text)`` to produce logits.
        iterator: iterable of batches exposing ``batch.text`` (a
            ``(tokens, lengths)`` pair) and ``batch.label``.
        criterion: loss called as ``criterion(predictions, labels)``;
            assumed to return the mean over the batch — TODO confirm when a
            different reduction is used.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over examples, so a smaller
        final batch does not get the same weight as a full one. Returns
        ``(0.0, 0.0)`` when the iterator yields no examples.
    """
    total_loss = 0.0
    total_acc = 0.0
    total_examples = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, _ = batch.text  # the lengths are not used by the model
            # squeeze(1) only has an effect when the model emits one logit per example.
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = accuracy(predictions, batch.label)
            batch_size = len(batch.label)
            total_loss += loss.item() * batch_size
            total_acc += acc.item() * batch_size
            total_examples += batch_size
    if total_examples == 0:
        return 0.0, 0.0
    return total_loss / total_examples, total_acc / total_examples
# Compute accuracy
def accuracy(predictions, y):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        predictions: tensor of per-class scores; the arg-max is taken over dim 1.
        y: tensor of integer class labels, one per row of ``predictions``.

    Returns:
        A 0-dim float tensor (callers use ``.item()``). An empty batch yields
        0 instead of NaN.
    """
    if y.numel() == 0:
        # Avoid 0/0 when there is nothing to score.
        return predictions.new_zeros(())
    predicted = predictions.argmax(dim=1)
    return (predicted == y).float().mean()
N_EPOCHS = 10
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # NOTE(review): the test iterator doubles as the validation set here, so
    # checkpoint selection is done on the same data that is reported below.
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    # Keep only the checkpoint with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'textcnn_model.pt')
    # Accuracies are fractions; multiply by 100 to print percentages.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')
# Reload the best checkpoint onto the device in use.
model.load_state_dict(torch.load('textcnn_model.pt', map_location=device))
# Final evaluation
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')
BERT实现
BERT是一种基于Transformer的深度学习模型,它可以自动学习上下文信息,并具有很强的表征能力。BERT在自然语言处理领域取得了很多突破性的成果,包括文本分类、语言生成和问答系统等任务。
在AG_news数据集上,我们使用预训练的BERT模型进行文本分类,代码如下:
import transformers
import torch
# Hyperparameters
MAX_SEQ_LENGTH = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 2

# Data file names
train_data = 'ag_news.train'
test_data = 'ag_news.test'

# Pretrained tokenizer and a sequence-classification model with 4 labels
model_name = 'bert-base-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
)
# Preprocess a single text
def preprocess_data(text):
    """Encode ``text`` with the module-level tokenizer.

    The text is padded/truncated to ``MAX_SEQ_LENGTH`` and returned as
    PyTorch tensors.

    Returns:
        ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encoding_options = {
        'max_length': MAX_SEQ_LENGTH,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_tensors': 'pt',
    }
    encoded = tokenizer.encode_plus(text, **encoding_options)
    token_ids = encoded['input_ids']
    mask = encoded['attention_mask']
    return token_ids, mask
def preprocess_dataset(dataset):
    """Encode every example of ``dataset`` and stack the results.

    Each example must expose ``.text`` and ``.label``. Labels are passed to
    ``torch.tensor`` as-is, so they are presumably numeric — TODO confirm
    against the dataset that is actually loaded.

    Returns:
        ``(input_ids, attention_masks, labels)`` where the first two are the
        per-example encodings concatenated along dim 0.
    """
    encoded_pairs = []
    label_values = []
    for example in dataset:
        encoded_pairs.append(preprocess_data(example.text))
        label_values.append(example.label)
    stacked_ids = torch.cat([ids for ids, _ in encoded_pairs], dim=0)
    stacked_masks = torch.cat([mask for _, mask in encoded_pairs], dim=0)
    label_tensor = torch.tensor(label_values)
    return stacked_ids, stacked_masks, label_tensor
# NOTE(review): torchtext, TEXT and LABEL are not defined in this snippet; it
# relies on them being left over from the TextCNN section — confirm before
# running this part on its own.
train_dataset, test_dataset = torchtext.legacy.datasets.AG_NEWS.splits(TEXT, LABEL, root='data')
train_input_ids, train_attention_masks, train_labels = preprocess_dataset(train_dataset)
test_input_ids, test_attention_masks, test_labels = preprocess_dataset(test_dataset)
# Data loaders
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
# Optimizer and loss function
# transformers.AdamW is deprecated; torch.optim.AdamW is used instead, with
# eps and weight_decay set to the values transformers.AdamW used by default.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
# Referenced through torch.nn because this snippet only imports torch.
criterion = torch.nn.CrossEntropyLoss()
# Move the model and the loss to the training device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)
# Fine-tune for NUM_EPOCHS passes over the training loader.
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0
    train_correct = 0
    for batch in train_loader:
        # Move the three tensors of the batch to the training device.
        input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        # With labels supplied, element 0 is the loss and element 1 the logits.
        loss, logits = outputs[0], outputs[1]
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so the epoch value is a per-example mean.
        train_loss += loss.item() * len(input_ids)
        train_correct += (torch.argmax(logits, dim=1) == labels).sum()
    train_loss /= len(train_dataset)
    train_accuracy = train_correct.double() / len(train_dataset)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_accuracy*100:.2f}%')
# Evaluate on the test loader without tracking gradients.
model.eval()
test_loss = 0
test_correct = 0
with torch.no_grad():
    for batch in test_loader:
        # Move the three tensors of the batch to the evaluation device.
        input_ids, attention_masks, labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        # With labels supplied, element 0 is the loss and element 1 the logits.
        loss, logits = outputs[0], outputs[1]
        # Weight the batch loss by the batch size so the final value is a per-example mean.
        test_loss += loss.item() * len(input_ids)
        test_correct += (torch.argmax(logits, dim=1) == labels).sum()
test_loss /= len(test_dataset)
test_accuracy = test_correct.double() / len(test_dataset)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_accuracy*100:.2f}%')
魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐


所有评论(0)