# 导入相关包跟模块
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x : '%.2f' % x)  # pandas禁用科学计数法
# 读取数据
data = pd.read_csv('data.csv', index_col=0)  # 不需要第一列
data.shape
(564169, 11)
# 查看字段
data.columns
Index(['event_time', 'order_id', 'product_id', 'category_id', 'category_code',
       'brand', 'price', 'user_id', 'age', 'sex', 'local'],
      dtype='object')
# 数据预览
data.head()
event_time order_id product_id category_id category_code brand price user_id age sex local
0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南
1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南
2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京
3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京
4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 广东
# 数据概览
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     564169 non-null  object 
 1   order_id       564169 non-null  int64  
 2   product_id     564169 non-null  int64  
 3   category_id    564169 non-null  float64
 4   category_code  434799 non-null  object 
 5   brand          536945 non-null  object 
 6   price          564169 non-null  float64
 7   user_id        564169 non-null  float64
 8   age            564169 non-null  float64
 9   sex            564169 non-null  object 
 10  local          564169 non-null  object 
dtypes: float64(4), int64(2), object(5)
memory usage: 51.7+ MB

数据预处理

  • 缺失值
  • 重复值
  • 异常值
# 处理日期列
data['date'] = data.event_time.apply(lambda x: x.split(' ')[0])
data.head()
event_time order_id product_id category_id category_code brand price user_id age sex local date
0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24
1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24
2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24
3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24
4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 广东 2020-04-24
# 转化为日期格式
data['date'] = pd.to_datetime(data['date'])
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   event_time     564169 non-null  object        
 1   order_id       564169 non-null  int64         
 2   product_id     564169 non-null  int64         
 3   category_id    564169 non-null  float64       
 4   category_code  434799 non-null  object        
 5   brand          536945 non-null  object        
 6   price          564169 non-null  float64       
 7   user_id        564169 non-null  float64       
 8   age            564169 non-null  float64       
 9   sex            564169 non-null  object        
 10  local          564169 non-null  object        
 11  date           564169 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2), object(5)
memory usage: 56.0+ MB
# 提取月份
data['month'] = data.date.dt.month
data.head()
event_time order_id product_id category_id category_code brand price user_id age sex local date month
0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24 4
1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24 4
2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24 4
3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24 4
4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 广东 2020-04-24 4
# 提取星期几,0为周一,6为星期六
# data['weekday1'] = data.date.dt.weekday
# data.head()
# 0为星期日,1为星期一,2为星期二
data['weekday'] = data.date.apply(lambda x: x.strftime("%w"))
data.head()
event_time order_id product_id category_id category_code brand price user_id age sex local date month weekday
0 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24 4 5
1 2020-04-24 11:50:39 UTC 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24 4 5
2 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24 4 5
3 2020-04-24 14:37:43 UTC 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24 4 5
4 2020-04-24 19:16:21 UTC 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 广东 2020-04-24 4 5
# 删除原来字符时间列
# data.drop(labels=['event_time'], inplace=True)
del data['event_time']
data.head()
order_id product_id category_id category_code brand price user_id age sex local date month weekday
0 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24 4 5
1 2294359932054536986 1515966223509089906 2268105426648171520.00 electronics.tablet samsung 162.01 1515915625441993984.00 24.00 海南 2020-04-24 4 5
2 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24 4 5
3 2294444024058086220 2273948319057183658 2268105430162997248.00 electronics.audio.headphone huawei 77.52 1515915625447879424.00 38.00 北京 2020-04-24 4 5
4 2294584263154074236 2273948316817424439 2268105471367840000.00 NaN karcher 217.57 1515915625443148032.00 32.00 广东 2020-04-24 4 5
# 缺失值处理
data.isnull().sum() #  查看缺失值情况
# 发现数据中category_code(产品品类)和brand(品牌)两列存在数据缺失,
# 因为产品品类缺失的比较多,直接用‘M’来替代缺失值;而brand缺失比较少,
# 不会对结果产生影响,直接删除缺失值;
order_id              0
product_id            0
category_id           0
category_code    129370
brand             27224
price                 0
user_id               0
age                   0
sex                   0
local                 0
date                  0
month                 0
weekday               0
dtype: int64
# 用特殊字符“M”替换缺失值
data['category_code'] = data['category_code'].fillna('M')
# 丢掉brand字段为空的行
data = data[data.brand.notnull()]  #  直接过滤
data.isnull().sum()
order_id         0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
age              0
sex              0
local            0
date             0
month            0
weekday          0
dtype: int64
# 重复值处理
data.duplicated().sum()  # 查看重复行
# 存在634条重复数据,但是真实的销售数据中可能存在一个用户在同天下单两次或多次的情况,
# 所以这里的处理方法是增加购买数量和总价格的列,
# 而不对重复值直接删除处理
634
# 增加新列:购买数量
data = data.value_counts().reset_index().rename(columns={0: "buy_count"})
# 增加新列:购买总金额
data['buy_amount'] = data['price']*data['buy_count']
data.head()
order_id product_id category_id category_code brand price user_id age sex local date month weekday buy_count buy_amount
0 2318945879811162983 2309018204833317816 2268105479144079872.00 M compliment 0.56 1515915625465863936.00 28.00 浙江 2020-05-28 5 4 4 2.24
1 2295740594749702229 1515966223509104892 2268105428166508800.00 electronics.smartphone apple 1387.01 1515915625448766464.00 21.00 北京 2020-04-26 4 0 4 5548.04
2 2388440981134674698 1515966223509106757 2360741867017995776.00 appliances.environment.air_conditioner samsung 366.41 1515915625514599680.00 50.00 广东 2020-11-16 11 1 4 1465.64
3 2375043331555066740 2273948308370096764 2268105409048871168.00 computers.network.router altel 57.85 1515915625504379136.00 19.00 上海 2020-08-13 8 4 4 231.40
4 2334999887038383089 1515966223509090031 2268105402673529600.00 M vitek 18.50 1515915625447765248.00 18.00 广东 2020-06-19 6 5 3 55.50
# 异常值处理
data.describe(percentiles=[0.01, 0.25, 0.75, 0.99]).T
# 以上7个字段均无异常情况:price和amount最小值为0,
# 可能是免费商品或者赠品,不属于异常情况;
count mean std min 1% 25% 50% 75% 99% max
order_id 536311.00 2370510904966508032.00 20245175202374012.00 2294359932054536960.00 2305277023666307584.00 2353674406846267392.00 2376454570843832320.00 2388440981134596608.00 2388440981134690304.00 2388440981134693888.00
product_id 536311.00 1692699904736436480.00 327324678972381504.00 1515966223509088512.00 1515966223509088512.00 1515966223509104896.00 1515966223509261824.00 1515966223527326208.00 2388434452475807744.00 2388434452476881920.00
category_id 536311.00 2273068434487365888.00 21891838583158944.00 2268105388421284352.00 2268105388991709952.00 2268105406549066752.00 2268105428166508800.00 2268105439323357952.00 2374498914000592384.00 2374498914001945600.00
price 536311.00 214.54 305.98 0.00 1.13 24.51 99.51 289.33 1387.01 11574.05
user_id 536311.00 1515915625493883648.00 24990827.60 1515915625439951872.00 1515915625441152000.00 1515915625467037184.00 1515915625486696704.00 1515915625511521280.00 1515915625514800128.00 1515915625514891264.00
age 536311.00 33.18 10.12 16.00 16.00 24.00 33.00 42.00 50.00 50.00
month 536311.00 7.72 2.56 1.00 1.00 6.00 8.00 10.00 11.00 11.00
buy_count 536311.00 1.00 0.04 1.00 1.00 1.00 1.00 1.00 1.00 4.00
buy_amount 536311.00 214.73 306.48 0.00 1.13 24.98 99.51 289.33 1387.01 11574.05
# 同时查看其它数据是否存在异常情况
data.describe(include='all').T
# 发现日期中存在“1970-01-01”的数据,应该是异常数据,予以删除;
count unique top freq first last mean std min 25% 50% 75% max
order_id 536311.00 NaN NaN NaN NaT NaT 2370510904966508032.00 20245175202374012.00 2294359932054536960.00 2353674406846267392.00 2376454570843832320.00 2388440981134596608.00 2388440981134693888.00
product_id 536311.00 NaN NaN NaN NaT NaT 1692699904736436480.00 327324678972381504.00 1515966223509088512.00 1515966223509104896.00 1515966223509261824.00 1515966223527326208.00 2388434452476881920.00
category_id 536311.00 NaN NaN NaN NaT NaT 2273068434487365888.00 21891838583158944.00 2268105388421284352.00 2268105406549066752.00 2268105428166508800.00 2268105439323357952.00 2374498914001945600.00
category_code 536311 124 M 116093 NaT NaT NaN NaN NaN NaN NaN NaN NaN
brand 536311 868 samsung 96123 NaT NaT NaN NaN NaN NaN NaN NaN NaN
price 536311.00 NaN NaN NaN NaT NaT 214.54 305.98 0.00 24.51 99.51 289.33 11574.05
user_id 536311.00 NaN NaN NaN NaT NaT 1515915625493883648.00 24990827.60 1515915625439951872.00 1515915625467037184.00 1515915625486696704.00 1515915625511521280.00 1515915625514891264.00
age 536311.00 NaN NaN NaN NaT NaT 33.18 10.12 16.00 24.00 33.00 42.00 50.00
sex 536311 2 270454 NaT NaT NaN NaN NaN NaN NaN NaN NaN
local 536311 11 广东 117097 NaT NaT NaN NaN NaN NaN NaN NaN NaN
date 536311 323 2020-10-22 00:00:00 8310 1970-01-01 2020-11-21 NaN NaN NaN NaN NaN NaN NaN
month 536311.00 NaN NaN NaN NaT NaT 7.72 2.56 1.00 6.00 8.00 10.00 11.00
weekday 536311 7 6 86379 NaT NaT NaN NaN NaN NaN NaN NaN NaN
buy_count 536311.00 NaN NaN NaN NaT NaT 1.00 0.04 1.00 1.00 1.00 1.00 4.00
buy_amount 536311.00 NaN NaN NaN NaT NaT 214.73 306.48 0.00 24.98 99.51 289.33 11574.05
# 去掉日期异常值
data = data[data.date>'1970-01-01'] # 直接过滤掉
data.date.min()
Timestamp('2020-01-05 00:00:00')
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 535065 entries, 0 to 536310
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   order_id       535065 non-null  int64         
 1   product_id     535065 non-null  int64         
 2   category_id    535065 non-null  float64       
 3   category_code  535065 non-null  object        
 4   brand          535065 non-null  object        
 5   price          535065 non-null  float64       
 6   user_id        535065 non-null  float64       
 7   age            535065 non-null  float64       
 8   sex            535065 non-null  object        
 9   local          535065 non-null  object        
 10  date           535065 non-null  datetime64[ns]
 11  month          535065 non-null  int64         
 12  weekday        535065 non-null  object        
 13  buy_count      535065 non-null  int64         
 14  buy_amount     535065 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(4), object(5)
memory usage: 65.3+ MB
data.shape
(535065, 15)

数据分析

  • 基础维度分析:地域、年龄、性别、0元用户
  • 产品分析:销量前10、销售额前10、销量前5人群特征
  • 用户分析:生命周期模型、帕累托模型、RFM模型
# 地区维度销量
data_local = data.groupby('local')['user_id'].nunique().reset_index().sort_values('user_id', ascending=False)
data_local.rename(columns={'user_id':'用户数'}, inplace=True)
data_local
local 用户数
4 广东 21382
0 上海 16031
1 北京 15928
5 江苏 5561
7 海南 5449
2 四川 5445
6 浙江 5370
8 湖北 5355
10 重庆 5342
3 天津 5337
9 湖南 5330
# 使用pyecharts进行可视化
from pyecharts import options as opts
from pyecharts.charts import Bar, Line
# 地域维度
bar = Bar()
bar.add_xaxis(data_local.local.tolist())
bar.add_yaxis('用户数',data_local['用户数'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各地区域用户数量情况",subtitle="区域用户数"),  #  设置主副标题
                    xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()
    <div id="dae8b16860cc447b9d61f32b0227b510" style="width:900px; height:500px;"></div>
# 性别维度
sex = data.groupby('sex')['user_id'].nunique().reset_index()
sex.rename(columns={'user_id':'用户数量'}, inplace=True)
sex
sex 用户数量
0 47235
1 47628
bar = Bar()
bar.add_xaxis(sex.sex.tolist())
bar.add_yaxis('用户',sex['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="性别分类用户数情况",subtitle="用户数"),
                    xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="da7684cee9fd43ff83621c244ab2c8f6" style="width:900px; height:500px;"></div>
# 年龄维度
age = data.groupby('age')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age
age 用户数量
27 43.00 2875
28 44.00 2854
16 32.00 2841
4 20.00 2838
12 28.00 2825
7 23.00 2820
22 38.00 2820
23 39.00 2813
20 36.00 2805
13 29.00 2802
0 16.00 2797
10 26.00 2793
6 22.00 2792
5 21.00 2786
29 45.00 2784
32 48.00 2781
15 31.00 2772
9 25.00 2768
8 24.00 2764
19 35.00 2764
26 42.00 2763
2 18.00 2759
14 30.00 2755
25 41.00 2753
17 33.00 2745
18 34.00 2738
1 17.00 2725
11 27.00 2724
34 50.00 2724
30 46.00 2713
21 37.00 2710
33 49.00 2707
3 19.00 2703
24 40.00 2685
31 47.00 2677
# 数据太多,直接可视化会看不清,做分箱处理
data_age = data.copy()
bins = [15, 20, 25, 30, 35, 40, 45, 50]
labels = ['(15-20]', '(20-25]', '(25-30]', '(30-35]', '(35-40]', '(40-45]', '(45-50]']
data_age['age_bin'] = pd.cut(x=data.age, bins=bins, labels=labels)

age = data_age.groupby('age_bin')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age
age_bin 用户数量
5 (40-45] 13969
1 (20-25] 13867
2 (25-30] 13831
3 (30-35] 13802
4 (35-40] 13775
0 (15-20] 13726
6 (45-50] 13535
bar = Bar()
bar.add_xaxis(age.age_bin.tolist())
bar.add_yaxis('用户',age['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各年龄用户数情况",subtitle="用户数"),
                    xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="4cf24a3b14b24b4e902b467ca391133d" style="width:900px; height:500px;"></div>
# 不同年龄段人群的消费差异
age_bin_data = data_age.groupby('age_bin').agg(消费金额=('buy_amount', 'sum'), 下单次数=('order_id', 'nunique'))
age_bin_data
消费金额 下单次数
age_bin
(15-20] 16007287.18 54302
(20-25] 16500743.70 55546
(25-30] 16513446.00 54723
(30-35] 17004578.70 58275
(35-40] 14995577.44 49744
(40-45] 17078724.54 59067
(45-50] 16886278.32 57085
bar1 = (
    Bar()
    .add_xaxis(list(age_bin_data.index))
    .add_yaxis('消费金额', list(age_bin_data['消费金额']), yaxis_index=0)
    .extend_axis(yaxis=opts.AxisOpts(type_="value",
#                                      axistick_opts=opts.AxisTickOpts(is_show=True),
                                     splitline_opts=opts.SplitLineOpts(is_show=True),))  # 双坐标轴 这个必须添加,添加虚线
    .set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='消费金额'))
)
bar2 = (
    Bar()
    .add_xaxis(list(age_bin_data.index))
    .add_yaxis('下单次数', list(age_bin_data['下单次数']),yaxis_index=1)
    .set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='下单次数'))
)
bar1.overlap(bar2)
bar1.render_notebook()
    <div id="f0f2a988d9144ec1aeef7b7d28826084" style="width:900px; height:500px;"></div>
# 0元用户
user_0 = data[data['price']==0]['user_id'].reset_index(drop=True)
user_0.shape
(30,)
user_0[~user_0.isin(data[data['price']>0]['user_id'])]
# 30个0元用户中,只有一位是没有产生消费的,可能是中奖用户
21   1515915625468531712.00
Name: user_id, dtype: float64
# 销量前10的产品
top = data.groupby('product_id').agg(销售总量=('buy_count','sum')).reset_index().sort_values('销售总量', ascending=False).reset_index(drop=True).head(10)
top
product_id 销售总量
0 1515966223517846928 2759
1 1515966223509106786 2597
2 1515966223509088532 2550
3 1515966223509088613 2549
4 1515966223509088567 2497
5 1515966223510174551 2370
6 1515966223509088521 2156
7 1515966223509104145 2037
8 1515966223509088639 1904
9 1515966223509117074 1797
bar = Bar()
bar.add_xaxis(top.product_id.tolist())
bar.add_yaxis('销量',top['销售总量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销量前十", subtitle='产品销售量'),
                    xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="7038c9df402a44bfb1327f249699ac13" style="width:900px; height:500px;"></div>
# 销售前十的产品类别
category_top = data[data.category_code != "M"].groupby('category_code').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(10)
category_top
category_code 销量
88 electronics.smartphone 102169
58 computers.notebook 25860
30 appliances.kitchen.refrigerators 20020
81 electronics.audio.headphone 19739
92 electronics.video.tv 17623
13 appliances.environment.vacuum 15906
33 appliances.kitchen.washer 14163
25 appliances.kitchen.kettle 11869
63 computers.peripherals.mouse 10146
99 furniture.kitchen.table 9659
bar = Bar()
bar.add_xaxis(category_top['category_code'].tolist())
bar.add_yaxis('销量',category_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销量前十", subtitle="类别销量"),
                    xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="1f90c798f25c4c01a0db3a6d918d542a" style="width:900px; height:500px;"></div>
# 产品销售额情况
product_amount_top = data.groupby('product_id').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
product_amount_top
product_id 销售额
46 1515966223509088567 2138006.31
95 1515966223509088671 1540314.88
183 1515966223509089284 1062128.50
67 1515966223509088628 1007196.96
12 1515966223509088509 921041.65
251 1515966223509089438 850236.39
26 1515966223509088532 767295.00
1035 1515966223509104892 760081.48
1221 1515966223509105893 739893.40
72 1515966223509088639 661068.80
bar = Bar()
bar.add_xaxis(product_amount_top.product_id.tolist())
bar.add_yaxis('销售额',product_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销售额情况", subtitle='销售额前十产品'),
                    xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=70),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="f3a5b14750424d7a879434e0c06035f5" style="width:900px; height:500px;"></div>
# 产品类别销售额情况
category_amount_top = data[data.category_code != "M"].groupby('category_code').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
category_amount_top
category_code 销售额
88 electronics.smartphone 41214019.09
58 computers.notebook 14785855.65
92 electronics.video.tv 8724539.82
30 appliances.kitchen.refrigerators 8598364.87
33 appliances.kitchen.washer 4923616.33
13 appliances.environment.vacuum 2439553.99
89 electronics.tablet 2343150.48
87 electronics.clocks 2180030.73
9 appliances.environment.air_conditioner 1826364.85
23 appliances.kitchen.hood 1799268.89
bar = Bar()
bar.add_xaxis(category_amount_top.category_code.tolist())
bar.add_yaxis('销售额',category_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销售额情况", subtitle='销售额前十产品类别'),
                    xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=75),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="0cd0495fa04746748323be044d107b4c" style="width:900px; height:500px;"></div>
# 手机销量表现最好的前5个品牌
brand_top = data[data.category_code == 'electronics.smartphone'].groupby('brand').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(5)
brand_top
brand 销量
14 samsung 51376
1 apple 23365
18 xiaomi 8328
7 huawei 7738
12 oppo 6876
bar = Bar()
bar.add_xaxis(brand_top.brand.tolist())
bar.add_yaxis('销量',brand_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="手机品牌销量情况", subtitle='销量前5手机品牌'),
                    xaxis_opts=opts.AxisOpts(name='手机品牌', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="22e71d28a7cc4bc29596388da0bded6d" style="width:900px; height:500px;"></div>
#  三星手机用户的分析, 筛选手机品牌为  sumsung  且产品类型为  electronics.smartphone 
age_bin_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('age_bin').agg(销量=('buy_count','sum')).reset_index().sort_values('销量',ascending=False)
age_bin_sumsung_phone
age_bin 销量
5 (40-45] 7651
6 (45-50] 7640
3 (30-35] 7593
2 (25-30] 7320
0 (15-20] 7273
1 (20-25] 7125
4 (35-40] 6774
bar = Bar()
bar.add_xaxis(age_bin_sumsung_phone.age_bin.tolist())
bar.add_yaxis('销量',age_bin_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各年龄段情况", subtitle="三星手机用户画像"),
                    xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()
    <div id="d39450a08c40435d9334a37616a63d8e" style="width:900px; height:500px;"></div>
local_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('local').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量',ascending=False)
local_sumsung_phone
local 销量
4 广东 11491
0 上海 8635
1 北京 8356
2 四川 3231
9 湖南 3120
7 海南 2874
10 重庆 2851
6 浙江 2828
5 江苏 2717
3 天津 2684
8 湖北 2589
bar = Bar()
bar.add_xaxis(local_sumsung_phone.local.tolist())
bar.add_yaxis('销量',local_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各区域销量情况", subtitle="区域用户画像"),
                    xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="591a52be7a6d47e6bd7c33646258990f" style="width:900px; height:500px;"></div>
sex_samsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('sex').agg(销量=('buy_count','sum')).reset_index()
sex_samsung_phone
sex 销量
0 25477
1 25899
bar = Bar()
bar.add_xaxis(sex_samsung_phone.sex.tolist())
bar.add_yaxis('销量',sex_samsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各性别销量情况", subtitle="性别用户画像"),
                    xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
    <div id="54166c92e64b4d61ae7b039fbf56c68c" style="width:900px; height:500px;"></div>

用户生命周期模型

pivoted_amount = data[data['buy_amount']>0].pivot_table(index='user_id', 
                                       columns='month',
                                      values='buy_count',
                                      aggfunc='sum').fillna(0)
pivoted_amount.head()
month 1 2 3 4 5 6 7 8 9 10 11
user_id
1515915625439951872.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00
1515915625440038400.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00
1515915625440051712.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 3.00 10.00
1515915625440099840.00 0.00 0.00 0.00 0.00 2.00 1.00 1.00 0.00 5.00 10.00 2.00
1515915625440121600.00 0.00 0.00 0.00 0.00 1.00 0.00 1.00 0.00 0.00 0.00 0.00
columns_month = pivoted_amount.columns.astype('str')  # 必须把列名转为字符
pivoted_amount.columns = columns_month
pivoted_purchase = pivoted_amount.applymap(lambda x: 1 if x>0 else 0)
pivoted_purchase.head()
month 1 2 3 4 5 6 7 8 9 10 11
user_id
1515915625439951872.00 0 0 0 0 0 0 1 0 0 0 0
1515915625440038400.00 0 0 0 0 0 0 0 0 1 1 0
1515915625440051712.00 0 0 0 0 0 0 0 0 0 1 1
1515915625440099840.00 0 0 0 0 1 1 1 0 1 1 1
1515915625440121600.00 0 0 0 0 1 0 1 0 0 0 0
def active_status(data):
    status =[]
    for i in range(11):
        #若本月没有消费
        if data[i] ==0:
            if len(status)>0:  #如果不是第一个月,
                if status[i-1]=='未注册': #如果上个月已经是未注册,那么本月也是未注册
                    status.append('未注册') 
                else:  #如果上月已注册,则本月为不活跃
                    status.append('不活跃')
            else:  #如果是第一个月
                status.append('未注册') #则未注册
        #若本月消费
        else:
            if len(status)==0: #如果是第一个月,则为新注册用户
                status.append('新客户')
            else:  #如果不是第一个月
                if status[i-1]=='不活跃':  #如果上月为不活跃,那么本月为回流
                    status.append('回流')
                elif status[i-1]=='未注册':  #如果上月为未注册,那么本月为新注册
                    status.append('新客户')
                else:  #如果上月为活跃,本月也为活跃
                    status.append('活跃')
    return pd.Series(status,index=columns_month)
pivoted_purchase_status = pivoted_purchase.apply(lambda x: active_status(x), axis=1)
pivoted_purchase_status.head()
month 1 2 3 4 5 6 7 8 9 10 11
user_id
1515915625439951872.00 未注册 未注册 未注册 未注册 未注册 未注册 新客户 不活跃 不活跃 不活跃 不活跃
1515915625440038400.00 未注册 未注册 未注册 未注册 未注册 未注册 未注册 未注册 新客户 活跃 不活跃
1515915625440051712.00 未注册 未注册 未注册 未注册 未注册 未注册 未注册 未注册 未注册 新客户 活跃
1515915625440099840.00 未注册 未注册 未注册 未注册 新客户 活跃 活跃 不活跃 回流 活跃 活跃
1515915625440121600.00 未注册 未注册 未注册 未注册 新客户 不活跃 回流 不活跃 不活跃 不活跃 不活跃
purchase_count = pivoted_purchase_status.apply(lambda x:x.value_counts())
# purchase_count.head()
# 去除未注册的数据行
purchase_count = purchase_count[purchase_count.index != "未注册"]
purchase_count = purchase_count.fillna(0)
purchase_count
month 1 2 3 4 5 6 7 8 9 10 11
不活跃 0.00 1190.00 2346 3685 5852 18118 24187 40985 64665 75528 84256.00
回流 0.00 0.00 261 432 1377 1251 3462 4244 4199 3740 2444.00
新客户 1813.00 1613.00 1491 4177 13914 8950 22036 22052 8163 5607 2938.00
活跃 0.00 623.00 819 800 1865 3639 4309 8765 7182 4941 3116.00
purchase_count = purchase_count.T
purchase_count
不活跃 回流 新客户 活跃
month
1 0.00 0.00 1813.00 0.00
2 1190.00 0.00 1613.00 623.00
3 2346.00 261.00 1491.00 819.00
4 3685.00 432.00 4177.00 800.00
5 5852.00 1377.00 13914.00 1865.00
6 18118.00 1251.00 8950.00 3639.00
7 24187.00 3462.00 22036.00 4309.00
8 40985.00 4244.00 22052.00 8765.00
9 64665.00 4199.00 8163.00 7182.00
10 75528.00 3740.00 5607.00 4941.00
11 84256.00 2444.00 2938.00 3116.00
line = Line()
line.add_xaxis(purchase_count.index.tolist())
line.add_yaxis('不活跃',purchase_count['不活跃'].tolist())
line.add_yaxis('活跃',purchase_count['活跃'].tolist())
line.add_yaxis('新客户',purchase_count['新客户'].tolist())
line.add_yaxis('回流',purchase_count['回流'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="用户生命周期分层情况", subtitle="用户分层"),
                    xaxis_opts=opts.AxisOpts(name='月份', name_location='middle', name_gap=35),  #  设置X轴标签
                    yaxis_opts=opts.AxisOpts(name='分层', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()
    <div id="5314f0ff88d54f9b9d2dc2234bab163d" style="width:900px; height:500px;"></div>

帕累托模型

# 查询每个用户的消费金额以及累计消费金额
user_2_8 = data.groupby('user_id').agg(消费金额=('buy_amount', 'sum')).sort_values('消费金额', ascending=False).reset_index()
user_2_8['累计销售额'] = user_2_8['消费金额'].cumsum()
user_2_8.head()
user_id 消费金额 累计销售额
0 1515915625512422912.00 160604.07 160604.07
1 1515915625513695488.00 158277.37 318881.44
2 1515915625512377088.00 149967.06 468848.50
3 1515915625513577472.00 135672.84 604521.34
4 1515915625514597888.00 133945.88 738467.22
p = user_2_8['消费金额'].cumsum() / user_2_8['消费金额'].sum()
key = p[p>0.8].index[0]
key
25408
key / user_2_8.shape[0]
0.2739259339119185
line = Line()
line.add_xaxis(user_2_8.index.tolist())
line.add_yaxis('累计销售额',user_2_8['累计销售额'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="帕累托模型", subtitle="累计销售额"),
#                     xaxis_opts=opts.AxisOpts(name='', name_location='middle', name_gap=35),  #  设置X轴标签
#                     yaxis_opts=opts.AxisOpts(name='', name_location='middle', name_gap=60),  #  设置Y周标签
                   toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()
    <div id="6084df0633cf4a048896a735c59a7adf" style="width:900px; height:500px;"></div>
Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐