数据可视化及预测,以及一些异常值检测算法
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# Font handle pointing at the Windows SimSun font file, for text that needs
# an explicit Chinese-capable font.
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
# Use SimHei for sans-serif text so CJK labels render, and draw the minus
# sign as an ASCII hyphen.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# The data is fairly irregular; according to the original author's trials this
# product / wholesale-market pair gives the clearest picture.
# The commented-out input() lines below allow choosing them interactively.
nd = '13084'
nm = '20690'
# nd = input('请输入农产品编号')
# nm = input('请输入批发市场编号')

# Monthly prices of the selected product at the selected wholesale market.
df_m = pd.read_excel('monthly_price_historical.xlsx')
df_m1 = df_m[df_m['农产品编号'] == nd]
df_m2 = df_m1[df_m1['批发市场编号'] == nm]
# Flatten the price cells of every selected row, in row order, into one list
# of floats.
s = []
time = []  # kept from the original script; the code that filled it was disabled
for row in df_m2.values:
    # Cells from column index 3 onward are read as prices.
    for cell in row[3:]:
        # '-' marks a missing quote; empty Excel cells arrive as NaN. Both are
        # skipped so no NaN reaches the regression models further down.
        if cell == '-' or pd.isna(cell):
            continue
        s.append(float(cell))
# Line chart of the collected price series (x axis is the sample index).
fig_line, ax_line = plt.subplots(figsize=(14, 6))
ax_line.set_title('商品月销售价格波动曲线')
ax_line.set_xlabel('时间')
ax_line.set_ylabel('月销售价格')
ax_line.plot(s)
plt.show()

# Bar chart of the same series. `x` (sample indices) is reused by the
# regression section below.
x = np.arange(len(s))
width = 0.5  # bar width
fig_bar, ax_bar = plt.subplots(figsize=(14, 6))
# bar arguments: x positions, heights, bar width, legend label, face colour.
ax_bar.bar(x, s, width=width, label='19639279', fc='b')
ax_bar.set_title('商品月销售价格波动曲线')
ax_bar.set_xlabel('时间')
ax_bar.set_ylabel('月销售价格')
plt.show()
# Simple linear regression and degree-3 polynomial regression of price
# against sample index. The first 80% of the samples (in order) are used for
# training, the remaining 20% are held out.
n = int(len(s) * 0.8)
X_train = np.asarray(x[:n]).reshape(-1, 1)
y_train = np.asarray(s[:n]).reshape(-1, 1)
X_test = np.asarray(x[n:]).reshape(-1, 1)
y_test = np.asarray(s[n:]).reshape(-1, 1)

# Straight-line fit.
model = LinearRegression()
model.fit(X_train, y_train)
# Grid on which both fitted curves are drawn.
xx = np.linspace(0, len(x), len(x))
yy = model.predict(xx.reshape(-1, 1))
plt.figure(figsize=(12, 6))
plt.scatter(x=X_train, y=y_train, color='k')  # training samples, black
plt.scatter(x=X_test, y=y_test, c='b')  # held-out samples, blue
plt.plot(xx, yy, 'g')  # linear fit, green

# Polynomial fit (degree 3; the "quadratic" variable names are kept because
# later code refers to them).
quadratic_featurizer = PolynomialFeatures(degree=3)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
# The featurizer is fitted on the training split only; the held-out split is
# transformed with the already-fitted featurizer, never re-fitted.
X_test_quadratic = quadratic_featurizer.transform(X_test)
model2 = LinearRegression()
model2.fit(X_train_quadratic, y_train)
xx2 = quadratic_featurizer.transform(xx[:, np.newaxis])
yy2 = model2.predict(xx2)
plt.plot(xx, yy2, '-r')  # polynomial fit, red
plt.title('一元线性回归和多元非线性回归')
plt.show()
# Isolation Forest outlier detection on the price series.
# From here on `s` is a column vector of shape (len(s), 1).
s = np.array(s).reshape(len(s), 1)
from sklearn.ensemble import IsolationForest
iforest = IsolationForest(n_estimators=5, contamination=0.1, max_samples='auto')
# fit_predict labels each sample -1 (outlier) or 1 (inlier).
prediction = iforest.fit_predict(s)
# Count the labels, do not sum them: summing the -1 labels reported the
# number of anomalous months as a negative value.
print("孤立森林算法检测异常价格月数: {}".format((prediction < 0).sum()))
print("孤立森林算法检测正常价格月数: {}".format((prediction > 0).sum()))

# Prices and sample positions of the inliers and of the outliers.
normal_data_y = s[prediction > 0]
normal_data_x = np.where(prediction == 1)
outliers_y = s[prediction < 0]
outliers_x = np.where(prediction == -1)

plt.figure(figsize=(16, 8))
plt.scatter(normal_data_x, normal_data_y)
plt.scatter(outliers_x, outliers_y)
plt.title("孤立森林算法")
plt.show()
# Sigma-rule outlier detection: wrap the price series in a DataFrame so the
# rule can be evaluated with pandas column statistics.
t_price = pd.DataFrame(s)
def three_sigma(t_price, n_sigma=2):
    """Flag values lying more than ``n_sigma`` standard deviations from the mean.

    NOTE(review): despite the function name, the original threshold was
    2 standard deviations. That value is kept as the default so existing
    callers get the same result; pass ``n_sigma=3`` for the classic 3-sigma
    rule.

    Args:
        t_price: pandas DataFrame of prices; mean and standard deviation are
            computed per column with pandas' default settings.
        n_sigma: width of the accepted band, in standard deviations, on each
            side of the column mean.

    Returns:
        ``t_price`` indexed by the boolean outlier mask: for a DataFrame this
        has the same shape as the input, with flagged values kept and every
        other cell set to NaN.
    """
    center = t_price.mean()
    spread = n_sigma * t_price.std()
    # Strict inequalities: a value exactly on the band edge is not flagged.
    rule = (t_price < center - spread) | (t_price > center + spread)
    out_price = t_price[rule]
    return out_price
# Apply the sigma rule and split the samples into flagged / normal points.
out_price = three_sigma(t_price)

# three_sigma keeps flagged values and leaves NaN everywhere else, so the
# non-NaN cells are the outliers. Testing for NaN explicitly (the original
# tested "value > 0") also classifies zero or negative outliers correctly.
flagged = out_price.iloc[:, 0].notna().values
prices = t_price.iloc[:, 0].values
positions = np.arange(len(prices))

# Plain Python lists of sample positions and prices for each group.
out_x = positions[flagged].tolist()
out_y = prices[flagged].tolist()
normal_x = positions[~flagged].tolist()
normal_y = prices[~flagged].tolist()

plt.figure(figsize=(16, 8))
plt.title('3sigma原则')
plt.scatter(normal_x, normal_y, c='g')  # normal samples, green
plt.scatter(out_x, out_y, c='r')  # flagged samples, red
plt.show()
print('sigma原则检测异常价格:')
print(out_y)
# Daily prices of the same product / wholesale-market pair.
df_d = pd.read_excel('daily_price.xlsx')
df_d1 = df_d[df_d['农产品编号'] == nd]
df_d2 = df_d1[df_d1['批发市场编号'] == nm]
df_d2x = df_d2['日期']
df_d2y = df_d2['平均价格']

# Tick labels are characters 5..9 of each date value
# (assumes the dates are strings such as 'YYYY-MM-DD...' — TODO confirm).
x = [day[5:10] for day in df_d2x]

plt.figure(figsize=(40, 6))
plt.plot(x, df_d2y)
plt.xlabel('时间')
plt.ylabel('日销售价格变化')
plt.show()
# Polynomial-model predictions for the held-out 20% of the monthly series.
# Predict at the actual held-out sample indices (X_test). The original sliced
# the plotting grid `xx`, whose points do not coincide with those indices.
xxp2 = quadratic_featurizer.transform(X_test)
yyp2 = model2.predict(xxp2)
print('训练集和测试集按照8:2的比例:')
print('这里预测的是每月的销售价格')
print(yyp2)

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。
更多推荐
所有评论(0)