1 数据说明
我们仍以sklearn自带的鸢尾花数据集(Iris)为例,在未知Species的情况下,仅根据特征变量来对植物进行聚类。
2 导入所需模块
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
import matplotlib.cm as cm
3 获取数据
这里仅获取了iris的特征变量,在本例中不使用iris的目标变量。
# Load the Iris dataset; only the feature matrix is kept (the target is not used).
iris = datasets.load_iris()
X = iris.data
# Hold out 70% of the samples as the test set; the model is fitted on the other 30%.
X_train, X_test = train_test_split(X, test_size=0.7, random_state=0)
4 模型训练
# Fit a 3-cluster K-means model on the training samples.
KMean = KMeans(n_clusters=3, random_state=10)
KMean.fit(X_train)
# Cluster index assigned to each training sample, and the fitted cluster centres.
labels = KMean.labels_
centers = KMean.cluster_centers_
for caption, value in (
    ('聚类后的标签为:', labels),
    ('聚类后的类中心坐标为:', centers),
):
    print(caption, '\n', value)
聚类后的标签为:
[0 0 0 1 1 0 0 1 0 2 1 2 1 0 1 0 2 0 0 2 0 2 1 1 1 2 2 1 1 0 1 2 1 0 1 1 1
1 0 0 0 2 1 2 0]
聚类后的类中心坐标为:
[[5.1 3.42352941 1.45294118 0.23529412]
[6. 2.77777778 4.32777778 1.36111111]
[7.08 3.14 6.01 2.16 ]]
可以用predict()函数来对新的样本进行分类,这里不再展示分类的结果!
# Assign each held-out sample to one of the fitted clusters.
labels_pred = KMean.predict(X=X_test)
5 模型评价
# Score the fitted clustering on the training samples with three indices:
# silhouette (Euclidean distance), Calinski-Harabasz and Davies-Bouldin.
silhouette = silhouette_score(X_train, labels, metric='euclidean')
davies = davies_bouldin_score(X_train, labels)
calinski = calinski_harabasz_score(X_train, labels)
print(
    'silhouette:', silhouette, '\n',
    'calinski:', calinski, '\n',
    'davies:', davies,
)
silhouette: 0.6004717622517666
calinski: 176.46898548716595
davies: 0.6239540921845386
6 模型选择
# Model selection: fit K-means for each candidate number of clusters and
# collect the three evaluation indices so they can be compared side by side.
range_n_clusters = [2, 3, 4, 5, 6]
silhouette_values = []
calinski_values = []
davies_values = []
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10).fit(X_train)
    cluster_labels = clusterer.labels_
    silhouette_avg = silhouette_score(X_train, cluster_labels)
    calinski_avg = calinski_harabasz_score(X_train, cluster_labels)
    davies_avg = davies_bouldin_score(X_train, cluster_labels)
    silhouette_values.append(silhouette_avg)
    calinski_values.append(calinski_avg)
    davies_values.append(davies_avg)

# Keep the per-index score arrays under their original names.
ks1 = np.array(silhouette_values, dtype=float)
ks2 = np.array(calinski_values, dtype=float)
ks3 = np.array(davies_values, dtype=float)

# One row per candidate cluster count. The index is taken from
# range_n_clusters so it always matches the number of scores collected.
# Column order (davies, calinski, silhouette) is the order the original
# sequence of insert(0, ...) calls produced.
ks_score = pd.DataFrame(
    {
        'davies_avg': ks3,
        'calinski_avg': ks2,
        'silhouette_avg': ks1,
    },
    index=range_n_clusters,
)
print(ks_score)
7 二维图形展示
鸢尾花数据集(Iris)有4个特征变量,因而无法直接绘制出K均值聚类在4维空间的聚类图形。这里仅取前2个特征变量,演示K均值聚类在2维空间上的聚类图示。
# 2-D illustration: cluster on the first two Iris features only, then plot
# the samples coloured by cluster together with the cluster centres.
KMean2 = KMeans(n_clusters=3, random_state=10)
KMean2.fit(X[:, 0:2])
labels_2d = KMean2.labels_
centers_2d = KMean2.cluster_centers_
# Map each cluster index (divided by 3) to a colour of the nipy_spectral colormap.
colors = cm.nipy_spectral(labels_2d.astype(float) / 3)

# Scatter of the samples.
sample_style = dict(marker=".", s=100, lw=0, alpha=0.7, c=colors, edgecolor="k")
plt.scatter(X[:, 0], X[:, 1], **sample_style)

# Scatter of the cluster centres, drawn as white circles with a black edge.
center_style = dict(marker="o", c="white", alpha=1, s=100, edgecolor="k")
plt.scatter(centers_2d[:, 0], centers_2d[:, 1], **center_style)

plt.show()