数据可视化及预测，以及一些异常值检测算法

一颗西柚子

432人浏览 · 2022-06-09 17:07:41

一颗西柚子 · 2022-06-09 17:07:41 发布

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
# SimSun font handle (the path is Windows-specific).
font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14)
# Select SimHei as the sans-serif font and disable the Unicode minus glyph;
# presumably so the Chinese titles and negative tick labels render correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# The data is fairly irregular; after some testing, the product id and the
# wholesale-market id below gave a somewhat clearer result.
nd, nm = '13084', '20690'
# Alternatively, enable the two lines below to enter the ids by hand.
# nd = input('请输入农产品编号')
# nm = input('请输入批发市场编号')

# Monthly prices of the selected product at the selected wholesale market.
df_m = pd.read_excel('monthly_price_historical.xlsx')
df_m1 = df_m.loc[df_m['农产品编号'] == nd]
df_m2 = df_m1.loc[df_m1['批发市场编号'] == nm]
# NOTE(review): nd / nm are strings; if the id columns are read as numbers
# these filters match nothing and `s` stays empty -- confirm the column dtype.

# Flatten every matching row into one chronological price series. Cells from
# the fourth column onward are read as prices; '-' cells (presumably missing
# values) are skipped.
s = []
time = []  # never filled in this section; kept in case later code refers to it
for row in df_m2.values:
    s.extend(float(cell) for cell in row[3:] if cell != '-')

# Line chart of the monthly price series.
plt.figure(figsize=(14, 6))
plt.plot(s)
plt.title('商品月销售价格波动曲线')
plt.xlabel('时间')
plt.ylabel('月销售价格')
plt.show()

# Bar chart of the same series; x is the 0-based position of each price and
# is reused further down as the regression input.
x = np.arange(len(s))
width = 0.5  # bar width
plt.figure(figsize=(14, 6))
# bar(x, height, width=..., label=legend name, fc=face colour)
plt.bar(x, s, width=width, label='19639279', fc='b')
plt.title('商品月销售价格波动曲线')
plt.xlabel('时间')
plt.ylabel('月销售价格')
plt.show()
# Straight-line and cubic-polynomial regression of price against position.
# Ordered 80/20 split: the first 80% of the series is the training set and
# the remainder is the test set.
n = int(len(s) * 0.8)
X_train = np.array(x[:n]).reshape(-1, 1)
y_train = np.array(s[:n]).reshape(-1, 1)
X_test = np.array(x[n:]).reshape(-1, 1)
y_test = np.array(s[n:]).reshape(-1, 1)

# Simple linear regression.
model = LinearRegression()
model.fit(X_train, y_train)
# Evenly spaced grid over [0, len(x)] used to draw the fitted curves.
xx = np.linspace(0, len(x), len(x))
yy = model.predict(xx.reshape(xx.shape[0], 1))
plt.figure(figsize=(12, 6))
plt.scatter(x=X_train, y=y_train, color='k')  # training points in black
plt.scatter(x=X_test, y=y_test, c='b')        # test points in blue
plt.plot(xx, yy, 'g')                         # linear fit in green

# Polynomial regression. The featurizer is named "quadratic" but it is
# configured with degree=3, i.e. a cubic fit.
quadratic_featurizer = PolynomialFeatures(degree=3)
X_train_quadratic = quadratic_featurizer.fit_transform(X_train)
# Fit the featurizer on the training split only; the test split is merely
# transformed with the already-fitted featurizer.
X_test_quadratic = quadratic_featurizer.transform(X_test)
model2 = LinearRegression()
model2.fit(X_train_quadratic, y_train)
xx2 = quadratic_featurizer.transform(xx[:, np.newaxis])
yy2 = model2.predict(xx2)
plt.plot(xx, yy2, '-r')                       # polynomial fit in red
plt.title('一元线性回归和多元非线性回归')

plt.show()
# Isolation-forest outlier detection on the monthly price series.
from sklearn.ensemble import IsolationForest

# Reshape the prices into a single-feature column, one row per sample.
s = np.array(s).reshape(len(s), 1)
# No random_state is passed, so the flagged set may differ between runs.
iforest = IsolationForest(n_estimators=5, contamination=0.1, max_samples='auto')
# fit_predict labels each sample +1 (inlier) or -1 (outlier).
prediction = iforest.fit_predict(s)
# Count the flagged samples. Summing the labels themselves would report the
# number of outlier months as a negative value because each label is -1.
print("孤立森林算法检测异常价格月数: {}".format(np.count_nonzero(prediction < 0)))
print("孤立森林算法检测正常价格月数: {}".format(np.count_nonzero(prediction > 0)))
normal_data_y = s[prediction > 0]
normal_data_x = np.where(prediction == 1)
outliers_y = s[prediction < 0]
outliers_x = np.where(prediction == -1)
plt.figure(figsize=(16, 8))
plt.scatter(normal_data_x, normal_data_y)
plt.scatter(outliers_x, outliers_y)
plt.title("孤立森林算法")
plt.show()
# 3-sigma rule

# Wrap the price series in a DataFrame so its mean()/std() can be used.
t_price = pd.DataFrame(data=s)
def three_sigma(t_price, n_sigma=2):
    """Keep the values lying more than ``n_sigma`` deviations from the mean.

    Args:
        t_price: DataFrame of prices; mean and standard deviation are
            computed per column.
        n_sigma: Half-width of the accepted band in standard deviations.
            The default is 2 -- despite the function's name -- because that
            is the threshold this script has always applied; pass 3 for the
            classic three-sigma rule.

    Returns:
        A DataFrame shaped like ``t_price`` in which values outside
        ``mean +/- n_sigma * std`` are kept and every other cell is NaN.
    """
    center = t_price.mean()
    spread = n_sigma * t_price.std()
    rule = (t_price < center - spread) | (t_price > center + spread)
    return t_price[rule]
out_price = three_sigma(t_price)
# three_sigma() keeps the flagged prices and leaves NaN everywhere else, so
# "not NaN" identifies an outlier. Testing `value > 0` instead would file a
# flagged price of zero or below under the normal points.
is_outlier = out_price.notna().to_numpy().ravel()
positions = np.arange(out_price.shape[0])
prices = np.asarray(s, dtype=float).ravel()
# Plain lists of ints / floats, so the printed result below is readable.
out_x = positions[is_outlier].tolist()
out_y = prices[is_outlier].tolist()
normal_x = positions[~is_outlier].tolist()
normal_y = prices[~is_outlier].tolist()
plt.figure(figsize=(16, 8))
plt.title('3sigma原则')
plt.scatter(normal_x, normal_y, c='g')  # normal prices in green
plt.scatter(out_x, out_y, c='r')        # flagged prices in red
plt.show()
print('sigma原则检测异常价格：')
print(out_y)

# Daily prices for the same product and wholesale market.
df_d = pd.read_excel('daily_price.xlsx')
df_d1 = df_d[df_d['农产品编号'] == nd]
df_d2 = df_d1[df_d1['批发市场编号'] == nm]
df_d2x = df_d2['日期']
df_d2y = df_d2['平均价格']
# Use characters 5..9 of each date value as the axis label -- the MM-DD part
# if the dates are 'YYYY-MM-DD...' strings (TODO confirm the column holds
# strings). Note that this rebinds x, replacing the earlier position array.
x = [date[5:10] for date in df_d2x]

plt.figure(figsize=(40, 6))
plt.xlabel('时间')
plt.ylabel('日销售价格变化')
plt.plot(x, df_d2y)

plt.show()
# Polynomial-model predictions for the held-out 20% of the monthly series.
# Evaluate at the real test positions (X_test). The plotting grid xx is
# np.linspace(0, len(x), len(x)), whose points are stretched away from the
# integer positions, so slicing it would predict at shifted x values.
xxp2 = quadratic_featurizer.transform(X_test)
yyp2 = model2.predict(xxp2)
print('训练集和测试集按照8：2的比例：')
print('这里预测的是每月的销售价格')
print(yyp2)

魔乐社区

魔乐社区（Modelers.cn）是一个中立、公益的人工智能社区，提供人工智能工具、模型、数据的托管、展示与应用协同服务，为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作，由全产业链共同建设、共同运营、共同享有，推动国产AI生态繁荣发展。

更多推荐

【计算机视觉】Pixel逐像素分类&Mask掩码分类理解摘要

魔乐社区

计算机视觉（opencv）实战三十二——CascadeClassifier 人脸微笑检测（摄像头）

本文从原理到实现，详细介绍了基于 OpenCV Haar 分类器的人脸与微笑检测：讲解了 Haar 特征和级联检测原理。对代码逐行拆解并解释参数含义。画出完整流程图，帮助理解执行过程。给出了常见问题和优化建议，甚至扩展到深度学习方法。这种方法简单、轻量、实时性好，非常适合入门和小型应用项目。但如果需要更高准确率和更强鲁棒性，建议使用深度学习检测器替代 Haar 分类器。