import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', lambda x : '%.2f' % x)
data = pd.read_csv('data.csv', index_col=0)
data.shape
(564169, 11)
data.columns
Index(['event_time', 'order_id', 'product_id', 'category_id', 'category_code',
'brand', 'price', 'user_id', 'age', 'sex', 'local'],
dtype='object')
data.head()
|
event_time |
order_id |
product_id |
category_id |
category_code |
brand |
price |
user_id |
age |
sex |
local |
| 0 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
| 1 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
| 2 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
| 3 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
| 4 |
2020-04-24 19:16:21 UTC |
2294584263154074236 |
2273948316817424439 |
2268105471367840000.00 |
NaN |
karcher |
217.57 |
1515915625443148032.00 |
32.00 |
女 |
广东 |
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 event_time 564169 non-null object
1 order_id 564169 non-null int64
2 product_id 564169 non-null int64
3 category_id 564169 non-null float64
4 category_code 434799 non-null object
5 brand 536945 non-null object
6 price 564169 non-null float64
7 user_id 564169 non-null float64
8 age 564169 non-null float64
9 sex 564169 non-null object
10 local 564169 non-null object
dtypes: float64(4), int64(2), object(5)
memory usage: 51.7+ MB
数据预处理
data['date'] = data.event_time.apply(lambda x: x.split(' ')[0])
data.head()
|
event_time |
order_id |
product_id |
category_id |
category_code |
brand |
price |
user_id |
age |
sex |
local |
date |
| 0 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
| 1 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
| 2 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
| 3 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
| 4 |
2020-04-24 19:16:21 UTC |
2294584263154074236 |
2273948316817424439 |
2268105471367840000.00 |
NaN |
karcher |
217.57 |
1515915625443148032.00 |
32.00 |
女 |
广东 |
2020-04-24 |
data['date'] = pd.to_datetime(data['date'])
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 564169 entries, 0 to 2633520
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 event_time 564169 non-null object
1 order_id 564169 non-null int64
2 product_id 564169 non-null int64
3 category_id 564169 non-null float64
4 category_code 434799 non-null object
5 brand 536945 non-null object
6 price 564169 non-null float64
7 user_id 564169 non-null float64
8 age 564169 non-null float64
9 sex 564169 non-null object
10 local 564169 non-null object
11 date 564169 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(4), int64(2), object(5)
memory usage: 56.0+ MB
data['month'] = data.date.dt.month
data.head()
|
event_time |
order_id |
product_id |
category_id |
category_code |
brand |
price |
user_id |
age |
sex |
local |
date |
month |
| 0 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
4 |
| 1 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
4 |
| 2 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
4 |
| 3 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
4 |
| 4 |
2020-04-24 19:16:21 UTC |
2294584263154074236 |
2273948316817424439 |
2268105471367840000.00 |
NaN |
karcher |
217.57 |
1515915625443148032.00 |
32.00 |
女 |
广东 |
2020-04-24 |
4 |
data['weekday'] = data.date.apply(lambda x: x.strftime("%w"))
data.head()
|
event_time |
order_id |
product_id |
category_id |
category_code |
brand |
price |
user_id |
age |
sex |
local |
date |
month |
weekday |
| 0 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
4 |
5 |
| 1 |
2020-04-24 11:50:39 UTC |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
4 |
5 |
| 2 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
4 |
5 |
| 3 |
2020-04-24 14:37:43 UTC |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
4 |
5 |
| 4 |
2020-04-24 19:16:21 UTC |
2294584263154074236 |
2273948316817424439 |
2268105471367840000.00 |
NaN |
karcher |
217.57 |
1515915625443148032.00 |
32.00 |
女 |
广东 |
2020-04-24 |
4 |
5 |
del data['event_time']
data.head()
|
order_id |
product_id |
category_id |
category_code |
brand |
price |
user_id |
age |
sex |
local |
date |
month |
weekday |
| 0 |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
4 |
5 |
| 1 |
2294359932054536986 |
1515966223509089906 |
2268105426648171520.00 |
electronics.tablet |
samsung |
162.01 |
1515915625441993984.00 |
24.00 |
女 |
海南 |
2020-04-24 |
4 |
5 |
| 2 |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
4 |
5 |
| 3 |
2294444024058086220 |
2273948319057183658 |
2268105430162997248.00 |
electronics.audio.headphone |
huawei |
77.52 |
1515915625447879424.00 |
38.00 |
女 |
北京 |
2020-04-24 |
4 |
5 |
| 4 |
2294584263154074236 |
2273948316817424439 |
2268105471367840000.00 |
NaN |
karcher |
217.57 |
1515915625443148032.00 |
32.00 |
女 |
广东 |
2020-04-24 |
4 |
5 |
data.isnull().sum()
order_id 0
product_id 0
category_id 0
category_code 129370
brand 27224
price 0
user_id 0
age 0
sex 0
local 0
date 0
month 0
weekday 0
dtype: int64
data['category_code'] = data['category_code'].fillna('M')
data = data[data.brand.notnull()]
data.isnull().sum()
order_id 0
product_id 0
category_id 0
category_code 0
brand 0
price 0
user_id 0
age 0
sex 0
local 0
date 0
month 0
weekday 0
dtype: int64
data.duplicated().sum()
634
data = data.value_counts().reset_index().rename(columns={0: "buy_count"})
data['buy_amount'] = data['price']*data['buy_count']
data.head()
|
order_id |
product_id |
category_id |
category_code |
brand |
price |
user_id |
age |
sex |
local |
date |
month |
weekday |
buy_count |
buy_amount |
| 0 |
2318945879811162983 |
2309018204833317816 |
2268105479144079872.00 |
M |
compliment |
0.56 |
1515915625465863936.00 |
28.00 |
女 |
浙江 |
2020-05-28 |
5 |
4 |
4 |
2.24 |
| 1 |
2295740594749702229 |
1515966223509104892 |
2268105428166508800.00 |
electronics.smartphone |
apple |
1387.01 |
1515915625448766464.00 |
21.00 |
男 |
北京 |
2020-04-26 |
4 |
0 |
4 |
5548.04 |
| 2 |
2388440981134674698 |
1515966223509106757 |
2360741867017995776.00 |
appliances.environment.air_conditioner |
samsung |
366.41 |
1515915625514599680.00 |
50.00 |
男 |
广东 |
2020-11-16 |
11 |
1 |
4 |
1465.64 |
| 3 |
2375043331555066740 |
2273948308370096764 |
2268105409048871168.00 |
computers.network.router |
altel |
57.85 |
1515915625504379136.00 |
19.00 |
女 |
上海 |
2020-08-13 |
8 |
4 |
4 |
231.40 |
| 4 |
2334999887038383089 |
1515966223509090031 |
2268105402673529600.00 |
M |
vitek |
18.50 |
1515915625447765248.00 |
18.00 |
男 |
广东 |
2020-06-19 |
6 |
5 |
3 |
55.50 |
data.describe(percentiles=[0.01, 0.25, 0.75, 0.99]).T
|
count |
mean |
std |
min |
1% |
25% |
50% |
75% |
99% |
max |
| order_id |
536311.00 |
2370510904966508032.00 |
20245175202374012.00 |
2294359932054536960.00 |
2305277023666307584.00 |
2353674406846267392.00 |
2376454570843832320.00 |
2388440981134596608.00 |
2388440981134690304.00 |
2388440981134693888.00 |
| product_id |
536311.00 |
1692699904736436480.00 |
327324678972381504.00 |
1515966223509088512.00 |
1515966223509088512.00 |
1515966223509104896.00 |
1515966223509261824.00 |
1515966223527326208.00 |
2388434452475807744.00 |
2388434452476881920.00 |
| category_id |
536311.00 |
2273068434487365888.00 |
21891838583158944.00 |
2268105388421284352.00 |
2268105388991709952.00 |
2268105406549066752.00 |
2268105428166508800.00 |
2268105439323357952.00 |
2374498914000592384.00 |
2374498914001945600.00 |
| price |
536311.00 |
214.54 |
305.98 |
0.00 |
1.13 |
24.51 |
99.51 |
289.33 |
1387.01 |
11574.05 |
| user_id |
536311.00 |
1515915625493883648.00 |
24990827.60 |
1515915625439951872.00 |
1515915625441152000.00 |
1515915625467037184.00 |
1515915625486696704.00 |
1515915625511521280.00 |
1515915625514800128.00 |
1515915625514891264.00 |
| age |
536311.00 |
33.18 |
10.12 |
16.00 |
16.00 |
24.00 |
33.00 |
42.00 |
50.00 |
50.00 |
| month |
536311.00 |
7.72 |
2.56 |
1.00 |
1.00 |
6.00 |
8.00 |
10.00 |
11.00 |
11.00 |
| buy_count |
536311.00 |
1.00 |
0.04 |
1.00 |
1.00 |
1.00 |
1.00 |
1.00 |
1.00 |
4.00 |
| buy_amount |
536311.00 |
214.73 |
306.48 |
0.00 |
1.13 |
24.98 |
99.51 |
289.33 |
1387.01 |
11574.05 |
data.describe(include='all').T
|
count |
unique |
top |
freq |
first |
last |
mean |
std |
min |
25% |
50% |
75% |
max |
| order_id |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
2370510904966508032.00 |
20245175202374012.00 |
2294359932054536960.00 |
2353674406846267392.00 |
2376454570843832320.00 |
2388440981134596608.00 |
2388440981134693888.00 |
| product_id |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
1692699904736436480.00 |
327324678972381504.00 |
1515966223509088512.00 |
1515966223509104896.00 |
1515966223509261824.00 |
1515966223527326208.00 |
2388434452476881920.00 |
| category_id |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
2273068434487365888.00 |
21891838583158944.00 |
2268105388421284352.00 |
2268105406549066752.00 |
2268105428166508800.00 |
2268105439323357952.00 |
2374498914001945600.00 |
| category_code |
536311 |
124 |
M |
116093 |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| brand |
536311 |
868 |
samsung |
96123 |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| price |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
214.54 |
305.98 |
0.00 |
24.51 |
99.51 |
289.33 |
11574.05 |
| user_id |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
1515915625493883648.00 |
24990827.60 |
1515915625439951872.00 |
1515915625467037184.00 |
1515915625486696704.00 |
1515915625511521280.00 |
1515915625514891264.00 |
| age |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
33.18 |
10.12 |
16.00 |
24.00 |
33.00 |
42.00 |
50.00 |
| sex |
536311 |
2 |
男 |
270454 |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| local |
536311 |
11 |
广东 |
117097 |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| date |
536311 |
323 |
2020-10-22 00:00:00 |
8310 |
1970-01-01 |
2020-11-21 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| month |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
7.72 |
2.56 |
1.00 |
6.00 |
8.00 |
10.00 |
11.00 |
| weekday |
536311 |
7 |
6 |
86379 |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
| buy_count |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
1.00 |
0.04 |
1.00 |
1.00 |
1.00 |
1.00 |
4.00 |
| buy_amount |
536311.00 |
NaN |
NaN |
NaN |
NaT |
NaT |
214.73 |
306.48 |
0.00 |
24.98 |
99.51 |
289.33 |
11574.05 |
data = data[data.date>'1970-01-01']
data.date.min()
Timestamp('2020-01-05 00:00:00')
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 535065 entries, 0 to 536310
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 535065 non-null int64
1 product_id 535065 non-null int64
2 category_id 535065 non-null float64
3 category_code 535065 non-null object
4 brand 535065 non-null object
5 price 535065 non-null float64
6 user_id 535065 non-null float64
7 age 535065 non-null float64
8 sex 535065 non-null object
9 local 535065 non-null object
10 date 535065 non-null datetime64[ns]
11 month 535065 non-null int64
12 weekday 535065 non-null object
13 buy_count 535065 non-null int64
14 buy_amount 535065 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(4), object(5)
memory usage: 65.3+ MB
data.shape
(535065, 15)
数据分析
- 基础维度分析:地域、年龄、性别、0元用户
- 产品分析:销量前10、销售额前10、销量前5人群特征
- 用户分析:生命周期模型、帕累托模型、RFM模型
data_local = data.groupby('local')['user_id'].nunique().reset_index().sort_values('user_id', ascending=False)
data_local.rename(columns={'user_id':'用户数'}, inplace=True)
data_local
|
local |
用户数 |
| 4 |
广东 |
21382 |
| 0 |
上海 |
16031 |
| 1 |
北京 |
15928 |
| 5 |
江苏 |
5561 |
| 7 |
海南 |
5449 |
| 2 |
四川 |
5445 |
| 6 |
浙江 |
5370 |
| 8 |
湖北 |
5355 |
| 10 |
重庆 |
5342 |
| 3 |
天津 |
5337 |
| 9 |
湖南 |
5330 |
from pyecharts import options as opts
from pyecharts.charts import Bar, Line
bar = Bar()
bar.add_xaxis(data_local.local.tolist())
bar.add_yaxis('用户数',data_local['用户数'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各地区域用户数量情况",subtitle="区域用户数"),
xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),
toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()
<div id="dae8b16860cc447b9d61f32b0227b510" style="width:900px; height:500px;"></div>
sex = data.groupby('sex')['user_id'].nunique().reset_index()
sex.rename(columns={'user_id':'用户数量'}, inplace=True)
sex
|
sex |
用户数量 |
| 0 |
女 |
47235 |
| 1 |
男 |
47628 |
bar = Bar()
bar.add_xaxis(sex.sex.tolist())
bar.add_yaxis('用户',sex['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="性别分类用户数情况",subtitle="用户数"),
xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="da7684cee9fd43ff83621c244ab2c8f6" style="width:900px; height:500px;"></div>
age = data.groupby('age')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age
|
age |
用户数量 |
| 27 |
43.00 |
2875 |
| 28 |
44.00 |
2854 |
| 16 |
32.00 |
2841 |
| 4 |
20.00 |
2838 |
| 12 |
28.00 |
2825 |
| 7 |
23.00 |
2820 |
| 22 |
38.00 |
2820 |
| 23 |
39.00 |
2813 |
| 20 |
36.00 |
2805 |
| 13 |
29.00 |
2802 |
| 0 |
16.00 |
2797 |
| 10 |
26.00 |
2793 |
| 6 |
22.00 |
2792 |
| 5 |
21.00 |
2786 |
| 29 |
45.00 |
2784 |
| 32 |
48.00 |
2781 |
| 15 |
31.00 |
2772 |
| 9 |
25.00 |
2768 |
| 8 |
24.00 |
2764 |
| 19 |
35.00 |
2764 |
| 26 |
42.00 |
2763 |
| 2 |
18.00 |
2759 |
| 14 |
30.00 |
2755 |
| 25 |
41.00 |
2753 |
| 17 |
33.00 |
2745 |
| 18 |
34.00 |
2738 |
| 1 |
17.00 |
2725 |
| 11 |
27.00 |
2724 |
| 34 |
50.00 |
2724 |
| 30 |
46.00 |
2713 |
| 21 |
37.00 |
2710 |
| 33 |
49.00 |
2707 |
| 3 |
19.00 |
2703 |
| 24 |
40.00 |
2685 |
| 31 |
47.00 |
2677 |
data_age = data.copy()
bins = [15, 20, 25, 30, 35, 40, 45, 50]
labels = ['(15-20]', '(20-25]', '(25-30]', '(30-35]', '(35-40]', '(40-45]', '(45-50]']
data_age['age_bin'] = pd.cut(x=data.age, bins=bins, labels=labels)
age = data_age.groupby('age_bin')['user_id'].nunique().reset_index()
age.rename(columns={'user_id':'用户数量'}, inplace=True)
age = age.sort_values(by='用户数量', ascending=False)
age
|
age_bin |
用户数量 |
| 5 |
(40-45] |
13969 |
| 1 |
(20-25] |
13867 |
| 2 |
(25-30] |
13831 |
| 3 |
(30-35] |
13802 |
| 4 |
(35-40] |
13775 |
| 0 |
(15-20] |
13726 |
| 6 |
(45-50] |
13535 |
bar = Bar()
bar.add_xaxis(age.age_bin.tolist())
bar.add_yaxis('用户',age['用户数量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="各年龄用户数情况",subtitle="用户数"),
xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='用户数', name_location='middle', name_gap=70),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="4cf24a3b14b24b4e902b467ca391133d" style="width:900px; height:500px;"></div>
age_bin_data = data_age.groupby('age_bin').agg(消费金额=('buy_amount', 'sum'), 下单次数=('order_id', 'nunique'))
age_bin_data
|
消费金额 |
下单次数 |
| age_bin |
|
|
| (15-20] |
16007287.18 |
54302 |
| (20-25] |
16500743.70 |
55546 |
| (25-30] |
16513446.00 |
54723 |
| (30-35] |
17004578.70 |
58275 |
| (35-40] |
14995577.44 |
49744 |
| (40-45] |
17078724.54 |
59067 |
| (45-50] |
16886278.32 |
57085 |
bar1 = (
Bar()
.add_xaxis(list(age_bin_data.index))
.add_yaxis('消费金额', list(age_bin_data['消费金额']), yaxis_index=0)
.extend_axis(yaxis=opts.AxisOpts(type_="value",
splitline_opts=opts.SplitLineOpts(is_show=True),))
.set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='消费金额'))
)
bar2 = (
Bar()
.add_xaxis(list(age_bin_data.index))
.add_yaxis('下单次数', list(age_bin_data['下单次数']),yaxis_index=1)
.set_global_opts(title_opts=opts.TitleOpts(title='各年龄段消费金额及下单次数', subtitle='下单次数'))
)
bar1.overlap(bar2)
bar1.render_notebook()
<div id="f0f2a988d9144ec1aeef7b7d28826084" style="width:900px; height:500px;"></div>
user_0 = data[data['price']==0]['user_id'].reset_index(drop=True)
user_0.shape
(30,)
user_0[~user_0.isin(data[data['price']>0]['user_id'])]
21 1515915625468531712.00
Name: user_id, dtype: float64
top = data.groupby('product_id').agg(销售总量=('buy_count','sum')).reset_index().sort_values('销售总量', ascending=False).reset_index(drop=True).head(10)
top
|
product_id |
销售总量 |
| 0 |
1515966223517846928 |
2759 |
| 1 |
1515966223509106786 |
2597 |
| 2 |
1515966223509088532 |
2550 |
| 3 |
1515966223509088613 |
2549 |
| 4 |
1515966223509088567 |
2497 |
| 5 |
1515966223510174551 |
2370 |
| 6 |
1515966223509088521 |
2156 |
| 7 |
1515966223509104145 |
2037 |
| 8 |
1515966223509088639 |
1904 |
| 9 |
1515966223509117074 |
1797 |
bar = Bar()
bar.add_xaxis(top.product_id.tolist())
bar.add_yaxis('销量',top['销售总量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销量前十", subtitle='产品销售量'),
xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="7038c9df402a44bfb1327f249699ac13" style="width:900px; height:500px;"></div>
category_top = data[data.category_code != "M"].groupby('category_code').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(10)
category_top
|
category_code |
销量 |
| 88 |
electronics.smartphone |
102169 |
| 58 |
computers.notebook |
25860 |
| 30 |
appliances.kitchen.refrigerators |
20020 |
| 81 |
electronics.audio.headphone |
19739 |
| 92 |
electronics.video.tv |
17623 |
| 13 |
appliances.environment.vacuum |
15906 |
| 33 |
appliances.kitchen.washer |
14163 |
| 25 |
appliances.kitchen.kettle |
11869 |
| 63 |
computers.peripherals.mouse |
10146 |
| 99 |
furniture.kitchen.table |
9659 |
bar = Bar()
bar.add_xaxis(category_top['category_code'].tolist())
bar.add_yaxis('销量',category_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销量前十", subtitle="类别销量"),
xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=70),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="1f90c798f25c4c01a0db3a6d918d542a" style="width:900px; height:500px;"></div>
product_amount_top = data.groupby('product_id').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
product_amount_top
|
product_id |
销售额 |
| 46 |
1515966223509088567 |
2138006.31 |
| 95 |
1515966223509088671 |
1540314.88 |
| 183 |
1515966223509089284 |
1062128.50 |
| 67 |
1515966223509088628 |
1007196.96 |
| 12 |
1515966223509088509 |
921041.65 |
| 251 |
1515966223509089438 |
850236.39 |
| 26 |
1515966223509088532 |
767295.00 |
| 1035 |
1515966223509104892 |
760081.48 |
| 1221 |
1515966223509105893 |
739893.40 |
| 72 |
1515966223509088639 |
661068.80 |
bar = Bar()
bar.add_xaxis(product_amount_top.product_id.tolist())
bar.add_yaxis('销售额',product_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品销售额情况", subtitle='销售额前十产品'),
xaxis_opts=opts.AxisOpts(name='产品', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=70),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="f3a5b14750424d7a879434e0c06035f5" style="width:900px; height:500px;"></div>
category_amount_top = data[data.category_code != "M"].groupby('category_code').agg(销售额=('buy_amount','sum')).reset_index().sort_values('销售额', ascending=False).head(10)
category_amount_top
|
category_code |
销售额 |
| 88 |
electronics.smartphone |
41214019.09 |
| 58 |
computers.notebook |
14785855.65 |
| 92 |
electronics.video.tv |
8724539.82 |
| 30 |
appliances.kitchen.refrigerators |
8598364.87 |
| 33 |
appliances.kitchen.washer |
4923616.33 |
| 13 |
appliances.environment.vacuum |
2439553.99 |
| 89 |
electronics.tablet |
2343150.48 |
| 87 |
electronics.clocks |
2180030.73 |
| 9 |
appliances.environment.air_conditioner |
1826364.85 |
| 23 |
appliances.kitchen.hood |
1799268.89 |
bar = Bar()
bar.add_xaxis(category_amount_top.category_code.tolist())
bar.add_yaxis('销售额',category_amount_top['销售额'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="产品类别销售额情况", subtitle='销售额前十产品类别'),
xaxis_opts=opts.AxisOpts(name='产品类别', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销售额', name_location='middle', name_gap=75),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="0cd0495fa04746748323be044d107b4c" style="width:900px; height:500px;"></div>
brand_top = data[data.category_code == 'electronics.smartphone'].groupby('brand').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量', ascending=False).head(5)
brand_top
|
brand |
销量 |
| 14 |
samsung |
51376 |
| 1 |
apple |
23365 |
| 18 |
xiaomi |
8328 |
| 7 |
huawei |
7738 |
| 12 |
oppo |
6876 |
bar = Bar()
bar.add_xaxis(brand_top.brand.tolist())
bar.add_yaxis('销量',brand_top['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="手机品牌销量情况", subtitle='销量前5手机品牌'),
xaxis_opts=opts.AxisOpts(name='手机品牌', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="22e71d28a7cc4bc29596388da0bded6d" style="width:900px; height:500px;"></div>
age_bin_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('age_bin').agg(销量=('buy_count','sum')).reset_index().sort_values('销量',ascending=False)
age_bin_sumsung_phone
|
age_bin |
销量 |
| 5 |
(40-45] |
7651 |
| 6 |
(45-50] |
7640 |
| 3 |
(30-35] |
7593 |
| 2 |
(25-30] |
7320 |
| 0 |
(15-20] |
7273 |
| 1 |
(20-25] |
7125 |
| 4 |
(35-40] |
6774 |
bar = Bar()
bar.add_xaxis(age_bin_sumsung_phone.age_bin.tolist())
bar.add_yaxis('销量',age_bin_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各年龄段情况", subtitle="三星手机用户画像"),
xaxis_opts=opts.AxisOpts(name='年龄段', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),
toolbox_opts=opts.ToolboxOpts())
bar.render_notebook()
<div id="d39450a08c40435d9334a37616a63d8e" style="width:900px; height:500px;"></div>
local_sumsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('local').agg(销量=('buy_count', 'sum')).reset_index().sort_values('销量',ascending=False)
local_sumsung_phone
|
local |
销量 |
| 4 |
广东 |
11491 |
| 0 |
上海 |
8635 |
| 1 |
北京 |
8356 |
| 2 |
四川 |
3231 |
| 9 |
湖南 |
3120 |
| 7 |
海南 |
2874 |
| 10 |
重庆 |
2851 |
| 6 |
浙江 |
2828 |
| 5 |
江苏 |
2717 |
| 3 |
天津 |
2684 |
| 8 |
湖北 |
2589 |
bar = Bar()
bar.add_xaxis(local_sumsung_phone.local.tolist())
bar.add_yaxis('销量',local_sumsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各区域销量情况", subtitle="区域用户画像"),
xaxis_opts=opts.AxisOpts(name='区域', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="591a52be7a6d47e6bd7c33646258990f" style="width:900px; height:500px;"></div>
sex_samsung_phone = data_age[(data_age.brand == 'samsung')&(data_age.category_code == 'electronics.smartphone')].groupby('sex').agg(销量=('buy_count','sum')).reset_index()
sex_samsung_phone
|
sex |
销量 |
| 0 |
女 |
25477 |
| 1 |
男 |
25899 |
bar = Bar()
bar.add_xaxis(sex_samsung_phone.sex.tolist())
bar.add_yaxis('销量',sex_samsung_phone['销量'].tolist())
bar.set_global_opts(title_opts=opts.TitleOpts(title="三星手机用户各性别销量情况", subtitle="性别用户画像"),
xaxis_opts=opts.AxisOpts(name='性别', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='销量', name_location='middle', name_gap=60),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
bar.render_notebook()
<div id="54166c92e64b4d61ae7b039fbf56c68c" style="width:900px; height:500px;"></div>
用户生命周期模型
pivoted_amount = data[data['buy_amount']>0].pivot_table(index='user_id',
columns='month',
values='buy_count',
aggfunc='sum').fillna(0)
pivoted_amount.head()
| month |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
| user_id |
|
|
|
|
|
|
|
|
|
|
|
| 1515915625439951872.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
| 1515915625440038400.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
1.00 |
0.00 |
| 1515915625440051712.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
0.00 |
3.00 |
10.00 |
| 1515915625440099840.00 |
0.00 |
0.00 |
0.00 |
0.00 |
2.00 |
1.00 |
1.00 |
0.00 |
5.00 |
10.00 |
2.00 |
| 1515915625440121600.00 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
0.00 |
1.00 |
0.00 |
0.00 |
0.00 |
0.00 |
columns_month = pivoted_amount.columns.astype('str')
pivoted_amount.columns = columns_month
pivoted_purchase = pivoted_amount.applymap(lambda x: 1 if x>0 else 0)
pivoted_purchase.head()
| month |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
| user_id |
|
|
|
|
|
|
|
|
|
|
|
| 1515915625439951872.00 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
| 1515915625440038400.00 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
0 |
| 1515915625440051712.00 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
1 |
| 1515915625440099840.00 |
0 |
0 |
0 |
0 |
1 |
1 |
1 |
0 |
1 |
1 |
1 |
| 1515915625440121600.00 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
def active_status(data):
status =[]
for i in range(11):
if data[i] ==0:
if len(status)>0:
if status[i-1]=='未注册':
status.append('未注册')
else:
status.append('不活跃')
else:
status.append('未注册')
else:
if len(status)==0:
status.append('新客户')
else:
if status[i-1]=='不活跃':
status.append('回流')
elif status[i-1]=='未注册':
status.append('新客户')
else:
status.append('活跃')
return pd.Series(status,index=columns_month)
pivoted_purchase_status = pivoted_purchase.apply(lambda x: active_status(x), axis=1)
pivoted_purchase_status.head()
| month |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
| user_id |
|
|
|
|
|
|
|
|
|
|
|
| 1515915625439951872.00 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
新客户 |
不活跃 |
不活跃 |
不活跃 |
不活跃 |
| 1515915625440038400.00 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
新客户 |
活跃 |
不活跃 |
| 1515915625440051712.00 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
未注册 |
新客户 |
活跃 |
| 1515915625440099840.00 |
未注册 |
未注册 |
未注册 |
未注册 |
新客户 |
活跃 |
活跃 |
不活跃 |
回流 |
活跃 |
活跃 |
| 1515915625440121600.00 |
未注册 |
未注册 |
未注册 |
未注册 |
新客户 |
不活跃 |
回流 |
不活跃 |
不活跃 |
不活跃 |
不活跃 |
purchase_count = pivoted_purchase_status.apply(lambda x:x.value_counts())
purchase_count = purchase_count[purchase_count.index != "未注册"]
purchase_count = purchase_count.fillna(0)
purchase_count
| month |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
| 不活跃 |
0.00 |
1190.00 |
2346 |
3685 |
5852 |
18118 |
24187 |
40985 |
64665 |
75528 |
84256.00 |
| 回流 |
0.00 |
0.00 |
261 |
432 |
1377 |
1251 |
3462 |
4244 |
4199 |
3740 |
2444.00 |
| 新客户 |
1813.00 |
1613.00 |
1491 |
4177 |
13914 |
8950 |
22036 |
22052 |
8163 |
5607 |
2938.00 |
| 活跃 |
0.00 |
623.00 |
819 |
800 |
1865 |
3639 |
4309 |
8765 |
7182 |
4941 |
3116.00 |
purchase_count = purchase_count.T
purchase_count
|
不活跃 |
回流 |
新客户 |
活跃 |
| month |
|
|
|
|
| 1 |
0.00 |
0.00 |
1813.00 |
0.00 |
| 2 |
1190.00 |
0.00 |
1613.00 |
623.00 |
| 3 |
2346.00 |
261.00 |
1491.00 |
819.00 |
| 4 |
3685.00 |
432.00 |
4177.00 |
800.00 |
| 5 |
5852.00 |
1377.00 |
13914.00 |
1865.00 |
| 6 |
18118.00 |
1251.00 |
8950.00 |
3639.00 |
| 7 |
24187.00 |
3462.00 |
22036.00 |
4309.00 |
| 8 |
40985.00 |
4244.00 |
22052.00 |
8765.00 |
| 9 |
64665.00 |
4199.00 |
8163.00 |
7182.00 |
| 10 |
75528.00 |
3740.00 |
5607.00 |
4941.00 |
| 11 |
84256.00 |
2444.00 |
2938.00 |
3116.00 |
line = Line()
line.add_xaxis(purchase_count.index.tolist())
line.add_yaxis('不活跃',purchase_count['不活跃'].tolist())
line.add_yaxis('活跃',purchase_count['活跃'].tolist())
line.add_yaxis('新客户',purchase_count['新客户'].tolist())
line.add_yaxis('回流',purchase_count['回流'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="用户生命周期分层情况", subtitle="用户分层"),
xaxis_opts=opts.AxisOpts(name='月份', name_location='middle', name_gap=35),
yaxis_opts=opts.AxisOpts(name='分层', name_location='middle', name_gap=60),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()
<div id="5314f0ff88d54f9b9d2dc2234bab163d" style="width:900px; height:500px;"></div>
帕累托模型
user_2_8 = data.groupby('user_id').agg(消费金额=('buy_amount', 'sum')).sort_values('消费金额', ascending=False).reset_index()
user_2_8['累计销售额'] = user_2_8['消费金额'].cumsum()
user_2_8.head()
|
user_id |
消费金额 |
累计销售额 |
| 0 |
1515915625512422912.00 |
160604.07 |
160604.07 |
| 1 |
1515915625513695488.00 |
158277.37 |
318881.44 |
| 2 |
1515915625512377088.00 |
149967.06 |
468848.50 |
| 3 |
1515915625513577472.00 |
135672.84 |
604521.34 |
| 4 |
1515915625514597888.00 |
133945.88 |
738467.22 |
p = user_2_8['消费金额'].cumsum() / user_2_8['消费金额'].sum()
key = p[p>0.8].index[0]
key
25408
key / user_2_8.shape[0]
0.2739259339119185
line = Line()
line.add_xaxis(user_2_8.index.tolist())
line.add_yaxis('累计销售额',user_2_8['累计销售额'].tolist())
line.set_global_opts(title_opts=opts.TitleOpts(title="帕累托模型", subtitle="累计销售额"),
toolbox_opts=opts.ToolboxOpts(item_gap=5))
line.render_notebook()
<div id="6084df0633cf4a048896a735c59a7adf" style="width:900px; height:500px;"></div>
所有评论(0)