tushare ID:441914
我是用jupyter做的分析,先导入相关的库,记得设置tushare的token。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import matplotlib.cm as cm #colormap

from sklearn.preprocessing import StandardScaler

from sklearn import cluster, covariance, manifold
from sklearn.metrics import silhouette_score # 轮廓系数:评价聚类好坏 越接近1越好
from sklearn.metrics import silhouette_samples
from sklearn.metrics import calinski_harabasz_score# 卡林斯基-哈拉巴斯指数,越高越好,评价聚类好坏
from sklearn.covariance import ShrunkCovariance
from sklearn.covariance import LedoitWolf #收缩协方差估计

import tushare as ts
# Read the tushare API token from a local file (kept out of the source for
# secrecy) and initialise the pro API client used by the rest of the script.
with open('token.txt') as f:
    token=f.read()
ts.set_token(token)
pro = ts.pro_api()

第一步定义相关的方法获取数据,并截取2017-12-31前上市的股票

# No.1
# Data acquisition
#%matplotlib inline  # Jupyter Notebook magic for inline figure display
plt.rcParams['font.sans-serif']=['SimHei']  # font able to render CJK stock names
plt.rcParams['axes.unicode_minus']=False  # render minus signs correctly with CJK fonts

#获取股票代码和名称
def get_code():
	code=pro.stock_basic(exchange='', list_status='L', fields='ts_code,symbol,name,area,industry,list_date')
    code[['list_date']]= code[['list_date']].values.astype(str)
    code['list_date']=(pd.to_datetime(code['list_date'])).dt.date
    code.set_index(['list_date'],inplace=True)

    from datetime import datetime as datetime_
    #截取
    cut='2017-12-31'
    cut=datetime_.strptime(cut,'%Y-%m-%d') 
    cut=datetime_.date(cut)
    code.index<=cut
    code=code.loc[code.index<=cut,:]
    
    print('len',len(code))
    codes=code.ts_code.values
    names=code.Name.values
    stocks=dict(zip(codes,names))
    return stocks

def get_data(code, start='20130101', end='20210331'):
    """Download forward-adjusted ('qfq') daily bars for *code* from tushare,
    indexed by trade date in ascending order."""
    bars = ts.pro_bar(ts_code=code, adj='qfq', start_date=start, end_date=end)
    bars.index = pd.to_datetime(bars.trade_date)
    return bars.sort_index()

def data_process(data, nan_threshold=112):
    """Clean the price-change matrix and scale each column to unit std.

    data: DataFrame with a 'trade_date' column plus one column per stock.
    nan_threshold: columns with more than this many NaNs are dropped
        (default 112, chosen in the original by inspecting the NaN-count
        distribution; now a parameter so other datasets can reuse this).

    Returns (X, indexs, df1):
      X      - ndarray of cleaned values, each column divided by its std
      indexs - Index of the dropped column labels (stock names)
      df1    - the cleaned DataFrame (common trading days only)

    Cleanup vs. the original: removed the dead `mid = dd[half] + dd[~half]/2`
    computation (bitwise-NOT on an int, result never used) and the duplicated
    `variation = df1.values` assignment whose first result was discarded.
    """
    df1 = data.copy()
    df1.set_index(['trade_date'], inplace=True)

    # NaN count per stock, most-missing first
    nan_counts = df1.isna().sum().sort_values(axis=0, ascending=False)
    print(nan_counts.describe())

    # Drop stocks with too much missing data, then keep only trading days
    # shared by every remaining stock.
    indexs = nan_counts[nan_counts > nan_threshold].index
    df1 = df1.drop(indexs, axis=1)
    df1 = df1.dropna()

    X = df1.values.copy()
    X /= X.std(axis=0)
    print(X.shape)
    return X, indexs, df1

def get_key(mapping, value):
    """Return every key in *mapping* whose value equals *value*.

    The original named its first parameter ``dict``, shadowing the builtin;
    renamed to ``mapping``. All calls in this file are positional, so the
    rename is backward-compatible here.
    """
    return [k for k, v in mapping.items() if v == value]

codes, names = np.array(sorted(get_code().items())).T  # sorted (code, name) pairs, transposed into two rows


def _close_open_diff(code):
    """Fetch one stock's bars ONCE and return its daily close-open series.

    The original evaluated get_data(code) twice per stock (once for .close,
    once for .open), doubling the API traffic and download time.
    """
    bars = get_data(code)
    return bars.close - bars.open


# One column per stock, named by the stock's display name.
data = pd.DataFrame({name: _close_open_diff(code)
                     for code, name in zip(codes, names)})

第二步进行数据处理,剔除空数据,调整输出格式

# No.2
# Data processing
X, indexs, df = data_process(data)

# Adjust the output mapping used later: remove the dropped stocks from the
# ts_code -> name dict so the printed clusters stay consistent.
# (The original started this comment with '%', a MATLAB-style comment that
# is a syntax error in plain Python.)
stocks = get_code()
for dropped_name in indexs:
    # BUG FIX: the original unpacked items() as (name, code), but `stocks`
    # maps ts_code -> name, so the comparison and the delete both used
    # swapped variables and the cleanup never removed the right entries.
    for ts_code, stock_name in list(stocks.items()):
        if stock_name == dropped_name:
            del stocks[ts_code]
            break

第三步训练数据

# No.3
# Correlation structure: shrunk covariance, then affinity-propagation
# clustering on the covariance matrix.
# NOTE: the original contained a bare `assume_centered=True` statement,
# a no-op presumably meant as a LedoitWolf constructor argument. X was
# scaled by its std but NOT mean-centered, so the default
# assume_centered=False is the statistically correct choice; the dead
# statement is simply removed.
edge_model = LedoitWolf()
edge_model.fit(X)
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()  # labels are 0..n_labels, i.e. n_labels+1 clusters

第四步查看结果

# No.4
# Results: print each cluster's members as "ts_code - name" pairs.

print(n_labels)

for cluster_id in range(n_labels + 1):
    print('Cluster %i:' % (cluster_id + 1), end='')
    for stock_name in names[labels == cluster_id]:
        # Reverse-lookup the ts_code(s) for this stock name
        for ts_code in get_key(stocks, stock_name):
            print(' ', ts_code, '-', stocks[ts_code], end=',')
    print()

第五步对聚类结果进行评价+评价可视化

# # No.5
# # Silhouette analysis of the clustering, visualised as one horizontal
# # "knife" of sorted per-sample silhouette values per cluster.

n_clusters = n_labels + 1
X = X.T  # the metric functions expect samples (stocks) as rows

fig, ax1 = plt.subplots(1, 1)
fig.set_size_inches(20, 65)

# Silhouette values live in [-1, 1]; only values > ~0 are interesting, so
# restrict the x axis to [-0.1, 0.5].
ax1.set_xlim([-0.1, 0.5])

# Vertical extent: all samples plus (n_clusters + 1) * 20 units of slack so
# the clusters are separated by visible gaps.
ax1.set_ylim([0, X.shape[0] + (n_clusters + 1) * 20])

# Compute each metric ONCE and reuse it — the original recomputed
# silhouette_score and silhouette_samples (both O(n^2)) several times.
silhouette_score_ = silhouette_score(X, labels)
silhouette_samples_ = silhouette_samples(X, labels)

print('silhouette_score: ', silhouette_score_)
print('silhouette_samples>0: ', (silhouette_samples_ > 0).sum())
print('calinski_harabasz_score: ', calinski_harabasz_score(X, labels))

print('For n_clusters = ', n_clusters, 'The average silhouette_score is :', silhouette_score_)

# Starting y position for the first cluster's band
y_lower = 10

for i in range(n_clusters):
    # Sorted silhouette values of the i-th cluster
    ith_cluster_silhouette_samples_ = silhouette_samples_[labels == i]
    ith_cluster_silhouette_samples_.sort()
    size_cluster_i = ith_cluster_silhouette_samples_.shape[0]  # samples in this cluster
    y_upper = y_lower + size_cluster_i

    # nipy_spectral maps a float in [0, 1] to a colour
    color = cm.nipy_spectral(float(i) / n_clusters)

    # fill_betweenx fills horizontally from x=0 out to each silhouette value
    ax1.fill_betweenx(np.arange(y_lower, y_upper)
                      ,ith_cluster_silhouette_samples_
                      ,facecolor=color
                      ,alpha=0.7
                     )
    # Write the cluster id next to its band
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    y_lower = y_upper + 20  # gap before the next cluster

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# Dashed red line at the average silhouette score
ax1.axvline(x=silhouette_score_, color="red", linestyle="--")
# Hide y ticks: the y axis is just stacked bands, not a scale
ax1.set_yticks([])
ax1.set_xticks([-0.1,0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40])

plt.suptitle(("Silhouette analysis for GraphicalLassoCV clustering on sample data "
              "with n_clusters = %d" % n_clusters),
             fontsize=14, fontweight='bold')

plt.show()

第六步对数据进行降维且对聚类结果可视化

# No.6
# Visualisation: project the stocks into 2-D with Locally Linear Embedding,
# draw one node per stock coloured by cluster, and connect pairs with a
# strong partial correlation.
# Compute the low-dimensional positions (X has stocks as rows here, since
# it was transposed in the previous step).
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)

embedding = node_position_model.fit_transform(X).T

# Plot setup: full-canvas axes with no frame
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')

# Partial correlations derived from the precision matrix of the
# LedoitWolf model fitted earlier
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
# Only pairs with |partial correlation| > 0.02 get an edge
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

# Plot the nodes at the embedded coordinates, coloured by cluster label
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,cmap=plt.cm.nipy_spectral)

# Draw the edges between related stocks, line width/colour by strength
start_idx, end_idx = np.where(non_zero)
segments = [[embedding[:, start], embedding[:, stop]]
            for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(segments,
                    zorder=0, cmap=plt.cm.hot_r,
                    norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)

# Label every node; the tricky part is positioning each label on the side
# away from its nearest neighbour so labels do not overlap
for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):

    # Distances to every other node along each axis (self set to 1 so the
    # node never picks itself as nearest neighbour)
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    # Offset direction relative to the nearest neighbour on each axis
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    plt.text(x, y, name, size=10,
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                       alpha=.6))

# Axis limits padded by a fraction of the coordinate range.
# NOTE(review): ndarray.ptp() was removed in NumPy 2.0 — switch to
# np.ptp(embedding[0]) etc. if upgrading NumPy.
plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
         embedding[0].max() + .10 * embedding[0].ptp(),)
plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
         embedding[1].max() + .03 * embedding[1].ptp())

plt.show()

最终成果部分截图
(插图:最终成果部分截图,原文中的图片在导出时未能保留。)
学习不易,数据分析新人初来乍到,请大家多多指教。
最后感谢tushare平台,让我们获取股票的数据和进行研究更方便啦!

Logo

魔乐社区(Modelers.cn) 是一个中立、公益的人工智能社区,提供人工智能工具、模型、数据的托管、展示与应用协同服务,为人工智能开发及爱好者搭建开放的学习交流平台。社区通过理事会方式运作,由全产业链共同建设、共同运营、共同享有,推动国产AI生态繁荣发展。

更多推荐