• pd.read_csv

详解pandas的read_csv方法 - 知乎 (zhihu.com)

  • 代码
# otto-group-product-classification-challenge
import numpy as np
import torch
import torch.optim as optim  # 优化器
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import pandas as pd


# Data preprocessing
# Map class-name labels to integer ids so a cross-entropy loss can consume them.
def labelsId(labels):
    """Convert Otto class names into zero-based integer ids.

    Args:
        labels: Iterable of strings, each one of 'Class_1' ... 'Class_9'.

    Returns:
        A list of ints in the same order as ``labels``, where 'Class_1'
        maps to 0 and 'Class_9' maps to 8.

    Raises:
        ValueError: If a label is not one of the nine known class names.
    """
    # The position of a name in this list is its id.
    class_names = [
        'Class_1', 'Class_2', 'Class_3',
        'Class_4', 'Class_5', 'Class_6',
        'Class_7', 'Class_8', 'Class_9',
    ]
    return [class_names.index(name) for name in labels]


# Dataset definition
class OttoGroupDataset(Dataset):
    """Training samples read from a CSV file.

    Attributes are set once in ``__init__``:
        len: number of rows in the CSV.
        x_data: float tensor built from every column except the first and
            the last (presumably the id and target columns - confirm
            against the CSV layout).
        y_data: list of integer class ids derived from the 'target' column.
    """

    def __init__(self, filepath):
        """Load ``filepath`` with pandas and split it into features and labels."""
        frame = pd.read_csv(filepath)
        self.len = frame.shape[0]

        # Features: skip the first column and the last column.
        raw_values = np.array(frame)
        feature_values = raw_values[:, 1:-1].astype(float)
        self.x_data = torch.Tensor(feature_values)

        # Labels: class names converted to integer ids.
        self.y_data = labelsId(frame['target'])

    def __getitem__(self, index):
        """Return the ``(features, label_id)`` pair stored at ``index``."""
        features = self.x_data[index]
        label_id = self.y_data[index]
        return features, label_id

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return self.len


# Model definition
class OttoGroupModel(torch.nn.Module):
    """Fully connected classifier mapping 93 input features to 9 class scores.

    Layer sizes are 93 -> 64 -> 32 -> 16 -> 9 with ReLU between the hidden
    layers.
    """

    def __init__(self):
        super(OttoGroupModel, self).__init__()
        self.linear1 = torch.nn.Linear(93, 64)
        self.linear2 = torch.nn.Linear(64, 32)
        self.linear3 = torch.nn.Linear(32, 16)
        self.linear4 = torch.nn.Linear(16, 9)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return the raw (unnormalized) class scores for a batch.

        Args:
            x: Float tensor whose last dimension has 93 features.

        Returns:
            Tensor of 9 scores per sample.
        """
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.relu(self.linear3(x))
        # The last layer is left without an activation so the output stays
        # a raw score (logit) rather than a non-linearly transformed value.
        return self.linear4(x)

    # Prediction helper
    def predict(self, x):
        """Predict a class per sample and return it one-hot encoded.

        The scores come from ``forward`` so prediction uses exactly the
        same computation as training. Applying ReLU to the final layer
        here would clamp negative scores to zero and could change which
        class wins.

        Args:
            x: Float tensor of shape (batch, 93).

        Returns:
            A pandas DataFrame of ints with one row per sample and one
            column per class (columns 0..8), holding 1 in the predicted
            class column and 0 elsewhere. All class columns are always
            present, even for classes that were never predicted.
        """
        with torch.no_grad():
            scores = self.forward(x)
            predicted = scores.argmax(dim=1)
            # Fix the width to the number of output classes; encoding only
            # the values that occur would drop columns for unseen classes.
            one_hot = torch.nn.functional.one_hot(
                predicted, num_classes=self.linear4.out_features
            )
        return pd.DataFrame(one_hot.cpu().numpy())


# Training
def train(epoch):
    """Run one pass over ``train_loader``, updating ``model`` in place.

    Uses the module-level ``train_loader``, ``model``, ``criterion`` and
    ``optimizer``. The mean loss of the last 300 batches is printed every
    300 batches.

    Args:
        epoch: Zero-based epoch number; only used in the progress output.
    """
    report_every = 300
    loss_sum = 0.0
    for step, (features, labels) in enumerate(train_loader):
        # Clear gradients left over from the previous step before backprop.
        optimizer.zero_grad()
        predictions = model(features.float())
        batch_loss = criterion(predictions, labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        if (step + 1) % report_every == 0:
            print('[%d, %5d] loss:%.3f' % (epoch + 1, step + 1, loss_sum / report_every))
            loss_sum = 0.0


# Build the training dataset and the loader that batches it
train_dataset = OttoGroupDataset(
    'otto-group-product-classification-challenge/train.csv'
)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True,
    num_workers=0,
)

# Create the model, the loss function and the optimizer
model = OttoGroupModel()
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.5)


if __name__ == '__main__':
    # Train for 10 epochs when the file is run as a script.
    for epoch in range(10):
        train(epoch)


# Write the prediction file
def predict_save():
    """Predict classes for the test CSV and save them to 'predict.csv'.

    Reads the test set, runs the module-level ``model`` on every column
    except the first, names the resulting one-hot columns after the nine
    classes, prepends the test set's 'id' column and writes the table
    without the DataFrame index.
    """
    test_frame = pd.read_csv('otto-group-product-classification-challenge/test.csv')
    # Everything after the first column is fed to the model as features.
    features = np.array(test_frame)[:, 1:]
    result = model.predict(torch.Tensor(features))

    # Rename the one-hot columns to the class names.
    result.columns = [
        'Class_1', 'Class_2', 'Class_3',
        'Class_4', 'Class_5', 'Class_6',
        'Class_7', 'Class_8', 'Class_9',
    ]
    # Put the id column first.
    result.insert(0, 'id', test_frame['id'])
    # Write the result to disk.
    result.to_csv('predict.csv', index=False)


if __name__ == '__main__':
    # Training only runs when this file is executed as a script, so the
    # prediction step is guarded the same way; otherwise importing the
    # module would write predict.csv from an untrained model.
    predict_save()
  • 题目链接

Otto Group Product Classification Challenge | Kaggle

  • 结果

Kaggle提交显示分数为0.74880

Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐