2.3 Example Code
2.3.1 Usage Example
The example below is adapted from the scikit-learn classifier comparison demo: http://sklearn.apachecn.org/cn/0.19.0/auto_examples/classification/plot_classifier_comparison.html
[1]:
# import the required libraries
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap               # colormap for rendering
from sklearn.model_selection import train_test_split       # train/test split
from sklearn.preprocessing import StandardScaler           # data standardization
from sklearn.datasets import make_moons, make_circles, make_classification  # three synthetic datasets
from sklearn.neighbors import KNeighborsClassifier         # k-nearest neighbors classifier
from sklearn.naive_bayes import GaussianNB                  # naive Bayes classifier
# generate the three synthetic datasets
def MakeDateSets():
    X, y = make_classification(n_features=2, n_redundant=0, n_informative=2, random_state=1, n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    datasets = [make_moons(noise=0.3, random_state=0),
                make_circles(noise=0.2, factor=0.5, random_state=1),
                linearly_separable]
    return datasets
# plot each dataset and the classifier's predictions on it
def DrawClassifiyResult(classifier, datasets, DTname):
    h = .02  # step size in the mesh
    figure = plt.figure(figsize=(9, 6))
    i = 1
    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):
        # preprocess dataset, split into training and test parts
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(2, len(datasets), i)
        ax.set_title(DTname[i - 1])
        # plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        # train the classifier and plot its predictions on the second row
        ax = plt.subplot(2, len(datasets), i + len(datasets))
        clf = classifier
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        # put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        # plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and the testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        # annotate each panel with the test accuracy
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
    plt.tight_layout()
    plt.show()
datasets = MakeDateSets()
DTname = ['make_moons','make_circles','linearly_separable']
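DrawClassifiyResult accepts any scikit-learn estimator that provides fit and score together with either decision_function or predict_proba. As a purely illustrative extra call (a sketch, not part of the original notebook; SVC and its parameters are only an assumed example):

from sklearn.svm import SVC                    # SVC exposes decision_function
SVCclf = SVC(kernel='rbf', gamma=2, C=1)       # illustrative, hypothetical parameters
DrawClassifiyResult(SVCclf, datasets, DTname)  # the decision_function branch is used here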
(1) Naive Bayes classification
[2]:
GNBclf = GaussianNB()
DrawClassifiyResult(GNBclf,datasets,DTname)
(2) Nearest neighbor classification
[3]:
KNNclf = KNeighborsClassifier(3)  # initialize the classifier with k = 3
DrawClassifiyResult(KNNclf,datasets,DTname)
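Note that neither GaussianNB nor KNeighborsClassifier exposes decision_function, so in both figures the colored background in the second row comes from the predict_proba branch of DrawClassifiyResult. A quick check (a minimal sketch):

print(hasattr(GNBclf, "decision_function"))   # False: predict_proba branch is used
print(hasattr(KNNclf, "decision_function"))   # False: predict_proba branch is used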
2.3.2 Analysis Example
[4]:
from sklearn import datasets  # note: this rebinds the name `datasets` (previously the list of toy datasets) to the sklearn module
# min-max normalization to [0, 1]
def Normalized(x):
    minx = np.min(x)
    maxX = np.max(x)
    x = (x - minx) / (maxX - minx)
    return x
# transform class labels to -1 and 1
def classtransform(x):
    for i in range(0, len(x)):
        if (x[i] > 0):
            x[i] = 1
        else:
            x[i] = -1
    return x
# dataset of two Gaussian blobs
def makeblods(n):
    np.random.seed(0)
    X, y = datasets.make_blobs(n_samples=n, centers=2, n_features=2, cluster_std=1, random_state=2)
    rand_X = 255 * Normalized(X)   # scale the features to [0, 255]
    Y = classtransform(y)
    return rand_X, Y
# dataset of two concentric circles
def makecircles(n):
    np.random.seed(0)
    X, y = datasets.make_circles(n_samples=n, factor=0.1, noise=0.1, random_state=2)
    rand_X = 255 * Normalized(X)   # scale the features to [0, 255]
    Y = classtransform(y)
    return rand_X, Y
# generate a 256 x 256 grid of test points covering [0, 255] x [0, 255]
def makeGrids():
    x = np.linspace(0, 255, 256)
    y = np.linspace(0, 255, 256)
    X, Y = np.meshgrid(x, y)
    Xzip = list(zip(X.flat, Y.flat))
    points = [point for point in zip(X.flat, Y.flat)]
    points = np.array(points)
    return Xzip, points
# draw the multi-panel comparison chart
def DrawManyChart(X, Y):
    # generate the two-blob Gaussian and the two-circle datasets
    n_samples = 1500
    XB, yb = makeblods(n_samples)
    XC, yc = makecircles(n_samples)
    cor = ListedColormap(['Blue', 'Orange'])
    plt.figure(figsize=(24, 10))
    # column 1: the original datasets
    plt.subplot(2, 4, 1)
    sct = plt.scatter(XB[:, 0], XB[:, 1], c=yb, s=10, cmap=cor)
    plt.title('two blob')
    plt.subplot(2, 4, 5)
    sct = plt.scatter(XC[:, 0], XC[:, 1], c=yc, s=10, cmap=cor)
    plt.title('two circle')
    # column 2: predicted labels on the grid points
    plt.subplot(2, 4, 2)
    sct = plt.scatter(X[:, 0], X[:, 1], c=Y[0], s=4, cmap=cor)
    plt.subplot(2, 4, 6)
    sct = plt.scatter(X[:, 0], X[:, 1], c=Y[3], s=4, cmap=cor)
    # columns 3 and 4: the two auxiliary quantities, drawn with colorbars
    zhengfuCor = [plt.cm.get_cmap('RdBu'), plt.cm.get_cmap('Blues')]
    for k in range(0, 8):
        idx = k + 1
        if (idx >= 3 and idx <= 4):
            plt.subplot(2, 4, idx)
            sct = plt.scatter(X[:, 0], X[:, 1], c=Y[idx - 2], s=4, cmap=zhengfuCor[idx - 3])
            plt.colorbar(sct)
        if (idx >= 7 and idx <= 8):
            plt.subplot(2, 4, idx)
            sct = plt.scatter(X[:, 0], X[:, 1], c=Y[idx - 3], s=4, cmap=zhengfuCor[idx - 7])
            plt.colorbar(sct)
    plt.show()
# generate the two-blob Gaussian and the two-circle datasets
n_samples = 1500
XB, yb = makeblods(n_samples)
XC, yc = makecircles(n_samples)
XAS = [XB, XC]
YAS = [yb, yc]
# generate the discrete grid of test points
Xzip, points = makeGrids()
np.random.shuffle(points)
points = np.array(points)
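A quick sanity check of the grid (a minimal sketch): makeGrids produces 256 × 256 = 65536 test points covering the same [0, 255] × [0, 255] range as the rescaled datasets.

print(points.shape)                  # (65536, 2)
print(points.min(), points.max())    # 0.0 255.0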
(1) How the naive Bayes classification result is formed
[5]:
ALLGBc = []
for i in range(0, len(XAS)):
    modelGB = GaussianNB()
    modelGB.fit(XAS[i], YAS[i])
    Lgb = modelGB.predict(points)                # predicted labels on the grid
    ALLGBc.append(Lgb)
    Postproba = modelGB.predict_proba(points)    # posterior probabilities of the two classes
    ALLGBc.append(Postproba[:, 0])
    ALLGBc.append(Postproba[:, 1])
DrawManyChart(points, ALLGBc)
In the figure above, the second column shows the Gaussian naive Bayes classification result for each dataset. The third column shows each test point's posterior probability of belonging to the negative class (-1), and the fourth column its posterior probability of belonging to the positive class (+1).
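The column order of predict_proba follows the fitted classes_ attribute, which scikit-learn keeps sorted in ascending order, so Postproba[:, 0] is the probability of class -1 and Postproba[:, 1] that of class +1. A quick check (a sketch, using the model fitted on the last dataset in the loop above):

print(modelGB.classes_)   # [-1  1]
print(Postproba[:3])      # each row: [P(y = -1), P(y = +1)], rows sum to 1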
(2) How the nearest neighbor classification result is formed
[6]:
ALLKNNc = []
for i in range(0, len(XAS)):
    modelKNC = KNeighborsClassifier(3)
    modelKNC.fit(XAS[i], YAS[i])
    Lknn = modelKNC.predict(points)              # predicted labels on the grid
    ALLKNNc.append(Lknn)
    dist = modelKNC.kneighbors(points)[0]        # distances to the 3 nearest training samples
    ALLKNNc.append(dist[:, 0])
    ALLKNNc.append(dist[:, 1])
DrawManyChart(points, ALLKNNc)
In the figure above, the second column shows the nearest neighbor classification result for each dataset. The third column shows, for each test point, the distance to its nearest training sample, and the fourth column the distance to its second-nearest training sample.
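For reference, kneighbors returns a (distances, indices) pair, each of shape (n_points, k), with distances sorted in ascending order, so dist[:, 0] is the distance to the nearest training sample and dist[:, 1] to the second nearest. A small sketch using the model fitted on the last dataset in the loop above:

dists, idxs = modelKNC.kneighbors(points[:3])   # k = 3 neighbors, as configured above
print(dists.shape)                              # (3, 3)
print(dists)                                    # each row is sorted in ascending order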