2.3 Example Code
2.3.1 Usage Example
The example below is adapted from the scikit-learn classifier comparison demo: http://sklearn.apachecn.org/cn/0.19.0/auto_examples/classification/plot_classifier_comparison.html
[1]:
# import the required libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap               # colormap for rendering
from sklearn.model_selection import train_test_split       # train/test split
from sklearn.preprocessing import StandardScaler           # data standardization
from sklearn.datasets import make_moons, make_circles, make_classification  # three synthetic datasets
from sklearn.neighbors import KNeighborsClassifier         # k-nearest neighbors classifier
from sklearn.naive_bayes import GaussianNB                  # naive Bayes classifier
# generate the three synthetic datasets
def MakeDateSets():
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable]
    return datasets
# plot each dataset and the classifier's predictions on it
def DrawClassifiyResult(classifier, datasets, DTname):
    h = .02  # step size in the mesh
    figure = plt.figure(figsize=(9, 6))
    i = 1
    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):
        # preprocess dataset, split into training and test parts
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(2, len(datasets), i)
        ax.set_title(DTname[i - 1])
        # plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        # train the classifier and plot its predictions on the second row
        ax = plt.subplot(2, len(datasets), i + len(datasets))
        clf = classifier
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        # put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        # plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        # annotate each panel with the test accuracy
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
    plt.tight_layout()
    plt.show()
datasets = MakeDateSets()
DTname = ['make_moons','make_circles','linearly_separable']
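DrawClassifiyResult accepts any scikit-learn estimator that provides fit and score together with either decision_function or predict_proba. As a purely illustrative extra call (a sketch, not part of the original notebook; SVC and its parameters are only an assumed example):

from sklearn.svm import SVC                    # SVC exposes decision_function
SVCclf = SVC(kernel='rbf', gamma=2, C=1)       # illustrative, hypothetical parameters
DrawClassifiyResult(SVCclf, datasets, DTname)  # the decision_function branch is used here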
(1) Naive Bayes classification
[2]:
GNBclf = GaussianNB()
DrawClassifiyResult(GNBclf,datasets,DTname)
(2) Nearest neighbor classification
[3]:
KNNclf = KNeighborsClassifier(3)  # initialize the classifier with k = 3
DrawClassifiyResult(KNNclf,datasets,DTname)
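Note that neither GaussianNB nor KNeighborsClassifier exposes decision_function, so in both figures the colored background in the second row comes from the predict_proba branch of DrawClassifiyResult. A quick check (a minimal sketch):

print(hasattr(GNBclf, "decision_function"))   # False: predict_proba branch is used
print(hasattr(KNNclf, "decision_function"))   # False: predict_proba branch is used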
2.3.2 Analysis Example
[4]:
from sklearn import datasets  # note: this rebinds the name `datasets` (previously the list of toy datasets) to the sklearn module
# min-max normalization to [0, 1]
def Normalized(x):
    minx = np.min(x)
    maxX = np.max(x)
    x = (x - minx) / (maxX - minx)
    return x
# transform class labels to -1 and 1
def classtransform(x):
    for i in range(0, len(x)):
        if (x[i] > 0):
            x[i] = 1
        else:
            x[i] = -1
    return x
# dataset of two Gaussian blobs
def makeblods(n):
    np.random.seed(0)
    X, y = datasets.make_blobs(n_samples=n, centers=2, n_features=2, cluster_std=1, random_state=2)
    rand_X = 255 * Normalized(X)   # scale the features to [0, 255]
    Y = classtransform(y)
    return rand_X, Y
# dataset of two concentric circles
def makecircles(n):
    np.random.seed(0)
    X, y = datasets.make_circles(n_samples=n, factor=0.1, noise=0.1, random_state=2)
    rand_X = 255 * Normalized(X)   # scale the features to [0, 255]
    Y = classtransform(y)
    return rand_X, Y
# generate a 256 x 256 grid of test points covering [0, 255] x [0, 255]
def makeGrids():
    x = np.linspace(0, 255, 256)
    y = np.linspace(0, 255, 256)
    X, Y = np.meshgrid(x, y)
    Xzip = list(zip(X.flat, Y.flat))
    points = [point for point in zip(X.flat, Y.flat)]
    points = np.array(points)
    return Xzip, points
# draw the multi-panel comparison chart
def DrawManyChart(X, Y):
    # generate the two-blob Gaussian and the two-circle datasets
    n_samples = 1500
    XB, yb = makeblods(n_samples)
    XC, yc = makecircles(n_samples)
    cor = ListedColormap(['Blue', 'Orange'])
    plt.figure(figsize=(24, 10))
    # column 1: the original datasets
    plt.subplot(2, 4, 1)
    sct = plt.scatter(XB[:, 0], XB[:, 1], c=yb, s=10, cmap=cor)
    plt.title('two blob')
    plt.subplot(2, 4, 5)
    sct = plt.scatter(XC[:, 0], XC[:, 1], c=yc, s=10, cmap=cor)
    plt.title('two circle')
    # column 2: predicted labels on the grid points
    plt.subplot(2, 4, 2)
    sct = plt.scatter(X[:, 0], X[:, 1], c=Y[0], s=4, cmap=cor)
    plt.subplot(2, 4, 6)
    sct = plt.scatter(X[:, 0], X[:, 1], c=Y[3], s=4, cmap=cor)
    # columns 3 and 4: the two auxiliary quantities, drawn with colorbars
    zhengfuCor = [plt.cm.get_cmap('RdBu'), plt.cm.get_cmap('Blues')]
    for k in range(0, 8):
        idx = k + 1
        if (idx >= 3 and idx <= 4):
            plt.subplot(2, 4, idx)
            sct = plt.scatter(X[:, 0], X[:, 1], c=Y[idx - 2], s=4, cmap=zhengfuCor[idx - 3])
            plt.colorbar(sct)
        if (idx >= 7 and idx <= 8):
            plt.subplot(2, 4, idx)
            sct = plt.scatter(X[:, 0], X[:, 1], c=Y[idx - 3], s=4, cmap=zhengfuCor[idx - 7])
            plt.colorbar(sct)
    plt.show()
# generate the two-blob Gaussian and the two-circle datasets
n_samples = 1500
XB, yb = makeblods(n_samples)
XC, yc = makecircles(n_samples)
XAS = [XB, XC]
YAS = [yb, yc]
# generate the discrete grid of test points
Xzip, points = makeGrids()
np.random.shuffle(points)
points = np.array(points)
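A quick sanity check of the grid (a minimal sketch): makeGrids produces 256 × 256 = 65536 test points covering the same [0, 255] × [0, 255] range as the rescaled datasets.

print(points.shape)                  # (65536, 2)
print(points.min(), points.max())    # 0.0 255.0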
(1) How the naive Bayes classification result is formed
[5]:
ALLGBc = []
for i in range(0, len(XAS)):
    modelGB = GaussianNB()
    modelGB.fit(XAS[i], YAS[i])
    Lgb = modelGB.predict(points)                # predicted labels on the grid
    ALLGBc.append(Lgb)
    Postproba = modelGB.predict_proba(points)    # posterior probabilities of the two classes
    ALLGBc.append(Postproba[:, 0])
    ALLGBc.append(Postproba[:, 1])
DrawManyChart(points, ALLGBc)
In the figure above, the second column shows the Gaussian naive Bayes classification result for each dataset. The third column shows each test point's posterior probability of belonging to the negative class (-1), and the fourth column its posterior probability of belonging to the positive class (+1).
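The column order of predict_proba follows the fitted classes_ attribute, which scikit-learn keeps sorted in ascending order, so Postproba[:, 0] is the probability of class -1 and Postproba[:, 1] that of class +1. A quick check (a sketch, using the model fitted on the last dataset in the loop above):

print(modelGB.classes_)   # [-1  1]
print(Postproba[:3])      # each row: [P(y = -1), P(y = +1)], rows sum to 1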
(2) How the nearest neighbor classification result is formed
[6]:
ALLKNNc = []
for i in range(0, len(XAS)):
    modelKNC = KNeighborsClassifier(3)
    modelKNC.fit(XAS[i], YAS[i])
    Lknn = modelKNC.predict(points)              # predicted labels on the grid
    ALLKNNc.append(Lknn)
    dist = modelKNC.kneighbors(points)[0]        # distances to the 3 nearest training samples
    ALLKNNc.append(dist[:, 0])
    ALLKNNc.append(dist[:, 1])
DrawManyChart(points, ALLKNNc)
In the figure above, the second column shows the nearest neighbor classification result for each dataset. The third column shows, for each test point, the distance to its nearest training sample, and the fourth column the distance to its second-nearest training sample.
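For reference, kneighbors returns a (distances, indices) pair, each of shape (n_points, k), with distances sorted in ascending order, so dist[:, 0] is the distance to the nearest training sample and dist[:, 1] to the second nearest. A small sketch using the model fitted on the last dataset in the loop above:

dists, idxs = modelKNC.kneighbors(points[:3])   # k = 3 neighbors, as configured above
print(dists.shape)                              # (3, 3)
print(dists)                                    # each row is sorted in ascending order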