# Dimensionality-reduction visualization: embed the numeric features into 2-D
# with t-SNE and plot each sample as its label text.
from sklearn.manifold import TSNE

# fixed: n_compOnents -> n_components (the mangled keyword raises TypeError)
tsne = TSNE(n_components=2, init='pca', random_state=0)
result = tsne.fit_transform(numeric)

# Min-max normalize the embedding to [0, 1] on each axis.
x_min, x_max = np.min(result, 0), np.max(result, 0)
result = (result - x_min) / (x_max - x_min)

# NOTE(review): assumes train_data carries a 'label' column whose values are
# the keys of color_index below ('o'/'n') — confirm against the data source.
label = train_data['label']
fig = plt.figure(figsize=(7, 7))
color_index = {'o': 0, 'n': 7}  # map each label value to a distinct Set1 color
for i in range(result.shape[0]):
    # fixed: fOntdict -> fontdict (the mangled keyword raises TypeError)
    plt.text(result[i, 0], result[i, 1], str(label[i]),
             color=plt.cm.Set1(color_index[label[i]] / 10.),
             fontdict={'weight': 'bold', 'size': 9})
plt.xticks([])
plt.yticks([])
plt.title('Visualization of data dimension reduction')
可以看到上面的图效果比较奇怪,不过数据确实被分开了。接下来使用 PCA 看看效果:
PCA 提取特征:首先用 pyod 库生成示例数据,这里选择生成 30 维的数据。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pyod as py
from pyod.utils.data import generate_data
from pyod.models.pca import PCA
from pyod.utils.example import visualize
from pyod.utils.data import evaluate_print
import seaborn as sns

# Generate a synthetic outlier-detection dataset with 30 features.
# fixed: train_Only -> train_only (the mangled keyword raises TypeError)
x_train, y_train = generate_data(
    n_train=500, n_features=30, train_only=True, behaviour='old', offset=10)
# Expected share of outliers; reused below as the PCA detector's contamination.
outlier_fraction = 0.1
再将数据转换成 DataFrame 格式,看看特征之间的关联性。
# Wrap the generated array in a DataFrame and explore feature relationships:
# correlation heatmap, per-feature densities, and a pairwise scatter matrix.
df_train = pd.DataFrame(x_train)
numeric_features = list(range(30))  # column labels 0..29 (replaces the 30-item literal)
print(numeric_features)
numeric = df_train[numeric_features]

# Pairwise Pearson correlation heatmap.
# NOTE(review): the title mentions "Price" but this data has no price column —
# likely copied from another tutorial; confirm the intended title.
correlation = numeric.corr()
f, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(correlation, square=True)
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
plt.show()

# Melt wide columns into long form, then draw one density plot per feature
# on a 6-column facet grid (axes not shared between facets).
f = pd.melt(df_train, value_vars=numeric_features)
print(f)
g = sns.FacetGrid(f, col="variable", col_wrap=6, sharex=False, sharey=False)
# NOTE(review): sns.distplot is deprecated in modern seaborn — kdeplot/displot
# is the replacement; kept as-is to preserve behavior on the pinned version.
g = g.map(sns.distplot, "value", hist=False, rug=True)

# Scatter matrix of all feature pairs, KDE on the diagonal.
# NOTE(review): pairplot's `size` parameter was renamed `height` in newer seaborn.
sns.set()
sns.pairplot(df_train[numeric_features], size=2, kind='scatter', diag_kind='kde')
plt.savefig('correlation.png')
plt.show()
随后对数据进行PCA分析
# Fit a PCA-based outlier detector and visualize the raw outlier scores.
df_train['y'] = y_train  # attach ground-truth labels for plotting

# Split the training set into outliers and inliers for reference counts.
x_outliers, x_inliers = py.utils.data.get_outliers_inliers(x_train, y_train)
n_inliers = len(x_inliers)
n_outliers = len(x_outliers)

# Scatter of the first two features, colored by ground-truth label.
sns.scatterplot(x=0, y=1, hue='y', data=df_train)
plt.show()

# Grid prepared for decision-boundary plotting (unused below as written).
xx, yy = np.meshgrid(np.linspace(-10, 10, 200), np.linspace(-10, 10, 200))

# fixed: n_compOnents -> n_components, cOntamination -> contamination
# (both mangled keywords raise TypeError in pyod's PCA constructor)
classifiers = {
    'Principal component analysis (PCA)':
        PCA(n_components=2, contamination=outlier_fraction)
}
for i, (clf_name, clf) in enumerate(classifiers.items()):
    clf.fit(x_train)
    y_train_pred = clf.labels_             # binary labels (0: inliers, 1: outliers)
    y_train_scores = clf.decision_scores_  # raw outlier scores

    # get the prediction on the test data
    clf_name = 'PCA'
    print("\nOn Training Data:")
    evaluate_print(clf_name, y_train, y_train_scores)
    # print(precision_score(y_train,y_train_pred))

    # Same scatter, now colored by the continuous outlier score.
    sns.scatterplot(x=0, y=1, hue=y_train_scores, data=df_train, palette='RdBu_r')
    plt.show()