Fasttext、TextCNN和BERT三种算法对AG_news数据的文本分类实现比较

Chaos_Wang_

2274人浏览 · 2023-03-21 23:17:25

Chaos_Wang_ · 2023-03-21 23:17:25 发布

❤️觉得内容不错的话，欢迎点赞收藏加关注😊😊😊，后续会继续输出更多优质内容❤️
👉有问题欢迎大家加关注私戳或者评论（包括但不限于NLP算法相关，linux学习相关，读研读博相关......）👈

Fasttext、TextCNN和BERT

（封面图由文心一格生成）

Fasttext、TextCNN和BERT三种算法对AG_news数据的文本分类实现比较

在自然语言处理领域中，文本分类是一项基础而重要的任务。为了验证不同算法在文本分类任务中的表现，我们使用了AG_news数据集，并分别使用Fasttext、TextCNN和BERT三种算法进行实现。本文将分别介绍三种算法的实现过程，并对比它们在AG_news数据集上的表现。

数据集介绍

AG_news数据集的训练集包含120000条新闻文本（另有7600条测试文本），分为四个类别：World、Sports、Business和Sci/Tech。其中训练集每个类别包含30000条新闻文本，每条新闻文本包含标题和正文。我们使用了其中的标题作为分类的文本数据。

Fasttext实现

Fasttext是一种基于n-gram的文本分类算法，它的核心思想是将文本表示成一个词袋模型，并使用n-gram构建特征表示。Fasttext还引入了层级Softmax和负采样等技术，可以高效地处理大规模文本数据。

在AG_news数据集上，我们使用Fasttext进行文本分类，代码如下：

import fasttext

# Paths handed to fastText for training and evaluation.
train_data = 'ag_news.train'
test_data = 'ag_news.test'

# Train a supervised classifier (word n-grams up to length 2).
model = fasttext.train_supervised(
    input=train_data,
    epoch=10,
    lr=1.0,
    wordNgrams=2,
    bucket=200000,
    dim=50,
    loss='ova',
)

# Persist the trained model.
model_path = 'fasttext_model.bin'
model.save_model(model_path)

# Evaluate on the test file and print whatever `model.test` returns.
result = model.test(test_data)
print(result)

TextCNN实现

TextCNN是一种卷积神经网络模型，它的核心思想是使用卷积层对文本进行特征提取，并使用池化层对特征进行下采样，最后使用全连接层进行分类。TextCNN的优点是可以在不同长度的文本上进行分类，并且可以自动学习文本的局部特征。

在AG_news数据集上，我们使用TextCNN进行文本分类，代码如下：

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext

# Model definition
class TextCNN(nn.Module):
    """Convolutional text classifier: embed, convolve, max-pool, classify.

    One ``nn.Conv2d`` branch is created per entry of ``filter_sizes`` and
    registered as ``conv_0``, ``conv_1``, ... Every branch spans the full
    embedding width; its feature maps are max-pooled over the sequence axis,
    the pooled vectors are concatenated, passed through dropout and fed to a
    final linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        n_filters: Output channels of every convolution branch.
        filter_sizes: Kernel heights (in tokens), one per branch.
        output_dim: Number of output logits.
        dropout: Dropout probability applied to the pooled features.

    Raises:
        ValueError: If ``filter_sizes`` is empty.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        if not filter_sizes:
            raise ValueError('filter_sizes must contain at least one kernel size')
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # Register the branches under the attribute names conv_0, conv_1, ...
        # so state_dict keys stay the same as with hand-written attributes.
        self._conv_names = []
        for idx, size in enumerate(filter_sizes):
            name = f'conv_{idx}'
            self.add_module(
                name,
                nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(size, embedding_dim)),
            )
            self._conv_names.append(name)
        # A convolution cannot slide over fewer tokens than its kernel height.
        self._min_seq_len = max(filter_sizes)
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences.

        Args:
            text: Integer tensor laid out as ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.embedding(text)
        # Conv2d wants (batch, channels, height, width): add a channel axis.
        embedded = embedded.unsqueeze(1)
        # Zero-pad the sequence axis when the batch is shorter than the
        # tallest kernel, instead of failing inside the convolution.
        missing = self._min_seq_len - embedded.shape[2]
        if missing > 0:
            embedded = nn.functional.pad(embedded, (0, 0, 0, missing))
        pooled = []
        for name in self._conv_names:
            # The kernel covers the whole embedding width, so the last axis
            # collapses to size 1 and can be squeezed away.
            conved = nn.functional.relu(getattr(self, name)(embedded).squeeze(3))
            # Max over all positions of the sequence axis.
            pooled.append(nn.functional.max_pool1d(conved, conved.shape[2]).squeeze(2))
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)
# Hyperparameters
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 4
DROPOUT = 0.5

# Preprocessing
# batch_first=True makes every batch (batch, seq_len), the layout
# TextCNN.forward needs because it inserts the channel axis at dim 1.
TEXT = torchtext.legacy.data.Field(
    tokenize='spacy',
    tokenizer_language='en_core_web_sm',
    include_lengths=True,
    batch_first=True,
)
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)
# NOTE(review): torchtext.legacy.datasets may not provide an AG_NEWS class
# with a `splits` method in the installed version -- confirm, since
# everything below depends on this call.
train_data, test_data = torchtext.legacy.datasets.AG_NEWS.splits(TEXT, LABEL, root='data')
MAX_VOCAB_SIZE = 10000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

# Size the embedding table from the vocabulary that was actually built.
# The vocabulary can be larger than MAX_VOCAB_SIZE (presumably because the
# special <unk>/<pad> tokens are added on top -- verify), so a hard-coded
# 10000 would leave some token ids out of range.
INPUT_DIM = len(TEXT.vocab)

# Iterators
BATCH_SIZE = 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, test_iterator = torchtext.legacy.data.BucketIterator.splits(
    (train_data, test_data), batch_size=BATCH_SIZE, device=device)

# Model, optimizer and loss
model = TextCNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

# Train for one epoch
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Each batch is expected to expose ``batch.text`` as a ``(tokens, lengths)``
    pair and ``batch.label`` as the targets.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of
        batches (``len(iterator)``).
    """
    model.train()
    loss_total = 0
    acc_total = 0
    for batch in iterator:
        optimizer.zero_grad()
        tokens, _lengths = batch.text  # lengths are not used by the model
        logits = model(tokens).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_acc = accuracy(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        loss_total += batch_loss.item()
        acc_total += batch_acc.item()
    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

# Evaluate without updating weights
def evaluate(model, iterator, criterion):
    """Compute loss and accuracy over ``iterator`` with gradients disabled.

    Each batch is expected to expose ``batch.text`` as a ``(tokens, lengths)``
    pair and ``batch.label`` as the targets.

    Returns:
        ``(mean_loss, mean_accuracy)``, both averaged over the number of
        batches (``len(iterator)``).
    """
    model.eval()
    loss_total = 0
    acc_total = 0
    with torch.no_grad():
        for batch in iterator:
            tokens, _lengths = batch.text  # lengths are not used by the model
            logits = model(tokens).squeeze(1)
            loss_total += criterion(logits, batch.label).item()
            acc_total += accuracy(logits, batch.label).item()
    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

# Accuracy of a batch of predictions
def accuracy(predictions, y):
    """Return the fraction of rows whose highest-scoring class equals ``y``.

    Args:
        predictions: Score tensor; the class axis is dim 1.
        y: Target class indices.

    Returns:
        A scalar tensor holding the share of correct predictions.
    """
    predicted = predictions.max(dim=1).indices
    hits = (predicted == y).float()
    return hits.sum() / hits.size(0)

N_EPOCHS = 10
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    # NOTE(review): the test iterator doubles as the validation set, so the
    # checkpoint below is selected on test data; a separate validation split
    # would give an unbiased final test score.
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    # Keep only the weights with the lowest validation loss seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'textcnn_model.pt')
    # Accuracies are fractions in [0, 1]; scale to percent for display.
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%')

# Reload the best checkpoint
model.load_state_dict(torch.load('textcnn_model.pt'))

# Final evaluation
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc*100:.2f}%')

BERT实现

BERT是一种基于Transformer的深度学习模型，它可以自动学习上下文信息，并具有很强的表征能力。BERT在自然语言处理领域取得了很多突破性的成果，包括文本分类、语言生成和问答系统等任务。

在AG_news数据集上，我们使用预训练的BERT模型进行文本分类，代码如下：

import transformers
import torch

# Fine-tuning hyperparameters
MAX_SEQ_LENGTH = 128
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
NUM_EPOCHS = 2

# Dataset file names
train_data = 'ag_news.train'
test_data = 'ag_news.test'

# Pretrained checkpoint: tokenizer plus a sequence-classification model
# configured for 4 labels.
model_name = 'bert-base-uncased'
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)
model = transformers.BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
)

# Tokenise a single example
def preprocess_data(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The encoding is truncated and padded to ``MAX_SEQ_LENGTH`` and returned
    as PyTorch tensors.

    Returns:
        ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    encode_kwargs = {
        'max_length': MAX_SEQ_LENGTH,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_tensors': 'pt',
    }
    encoding = tokenizer.encode_plus(text, **encode_kwargs)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    return input_ids, attention_mask

def preprocess_dataset(dataset):
    """Encode every example of ``dataset`` and stack the results.

    Each example must expose ``.text`` (passed to ``preprocess_data``) and
    ``.label``.

    Returns:
        ``(input_ids, attention_masks, labels)``: the per-example id and mask
        tensors concatenated along dim 0, and the labels as one tensor.
    """
    rows = []
    for example in dataset:
        ids, mask = preprocess_data(example.text)
        rows.append((ids, mask, example.label))
    input_ids = torch.cat([row[0] for row in rows], dim=0)
    attention_masks = torch.cat([row[1] for row in rows], dim=0)
    labels = torch.tensor([row[2] for row in rows])
    return input_ids, attention_masks, labels

# NOTE(review): TEXT and LABEL are the torchtext fields defined for the
# TextCNN experiment. TEXT tokenises with spaCy, so example.text is
# presumably a token list rather than a raw string, and example.label is
# presumably not yet a numeric id -- confirm both before relying on
# preprocess_dataset. Also confirm that torchtext.legacy.datasets provides
# AG_NEWS.splits in the installed version.
train_dataset, test_dataset = torchtext.legacy.datasets.AG_NEWS.splits(TEXT, LABEL, root='data')
train_input_ids, train_attention_masks, train_labels = preprocess_dataset(train_dataset)
test_input_ids, test_attention_masks, test_labels = preprocess_dataset(test_dataset)

# Data loaders
train_dataset = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_labels)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Optimizer and loss
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
# weight_decay are pinned to the values transformers.AdamW used by default
# so the optimisation settings stay the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)
# The loops below take the loss from the model output (labels are passed to
# the model), so this criterion is not used there; it is kept so the name
# stays defined.
criterion = torch.nn.CrossEntropyLoss()

# Move model and loss to the selected device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0.0
    train_correct = 0
    for input_ids, attention_masks, labels in train_loader:
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        # With labels supplied, the output carries the loss at index 0 and
        # the logits at index 1.
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        # Weight the batch loss by batch size so the epoch figure is a
        # per-example average even when the last batch is smaller.
        train_loss += loss.item() * len(input_ids)
        train_correct += (torch.argmax(outputs[1], dim=1) == labels).sum().item()
    # Epoch-level statistics: computed once per epoch, after all batches.
    train_loss /= len(train_dataset)
    train_accuracy = train_correct / len(train_dataset)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_accuracy*100:.2f}%')

# Evaluate on the test set
model.eval()
test_loss = 0.0
test_correct = 0
with torch.no_grad():
    for input_ids, attention_masks, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_masks = attention_masks.to(device)
        labels = labels.to(device)
        # With labels supplied, the output carries the loss at index 0 and
        # the logits at index 1.
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs[0]
        # Weight by batch size so the final figure is a per-example average.
        test_loss += loss.item() * len(input_ids)
        test_correct += (torch.argmax(outputs[1], dim=1) == labels).sum().item()
# Final statistics: computed once, after every test batch has been seen.
test_loss /= len(test_dataset)
test_accuracy = test_correct / len(test_dataset)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_accuracy*100:.2f}%')

魔乐社区

魔乐社区（Modelers.cn) 是一个中立、公益的人工智能社区，提供人工智能工具、模型、数据的托管、展示与应用协同服务，为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作，由全产业链共同建设、共同运营、共同享有，推动国产AI生态繁荣发展。

更多推荐

小参数・大码力・易部署 | Qwen3.6-27B上线魔乐社区，基于昇腾的部署教程来了

继一周前模型开源发布后，千问再度开源Qwen3.6-27B —— 一个拥有270亿参数的稠密多模态模型，也是社区呼声最高的模型规格。Qwen3.6-27B 依然支持多模态思考与非思考模式，在智能体编程方面达到了旗舰级表现，全面超越前代开源旗舰 Qwen3.5-397B-A17B（总参数397B / 激活参数17B的MoE模型）。作为稠密架构，它无需MoE路由即可部署，是开发者在实用、可广泛部署规模