import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load the Criteo sample data set into a DataFrame.
data_frame = pd.read_csv('./data/criteo_sample.txt')

# Columns C1..C26 hold opaque categorical tokens, so each distinct value is
# mapped to an integer id ("sparse": ids intended for a later embedding step).
sparse_feature_names = [f'C{index}' for index in range(1, 27)]
# Columns I1..I13 hold real numeric values, so they are rescaled to floats in
# the range 0~1 ("dense").
dense_feature_names = [f'I{index}' for index in range(1, 14)]

# Fill missing values: the string '-1' for categorical columns (so "missing"
# is encoded like any other category) and 0 for numeric columns.
sparse_columns = data_frame[sparse_feature_names]
data_frame[sparse_feature_names] = sparse_columns.fillna('-1')
dense_columns = data_frame[dense_feature_names]
data_frame[dense_feature_names] = dense_columns.fillna(0)
target = ['label']

# Replace the raw tokens of every categorical column with integer ids,
# fitting a fresh encoder per column so ids are assigned independently.
for column_name in sparse_feature_names:
    label_encoder = LabelEncoder()
    encoded_ids = label_encoder.fit_transform(data_frame[column_name])
    data_frame[column_name] = encoded_ids

# Normalize every numeric column to a float in the range 0~1.
min_max_scaler = MinMaxScaler(feature_range=(0, 1))
dense_values = data_frame[dense_feature_names]
data_frame[dense_feature_names] = min_max_scaler.fit_transform(dense_values)

以上代码改自 DeepCTR 项目的示例脚本:
https://github.com/shenweichen/DeepCTR/blob/master/examples/run_classification_criteo.py

Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐