1 数据说明
我们仍以sklearn自带的鸢尾花数据集(Iris)为例,在未知Species的情况下,仅根据特征变量来对植物进行分类。
2 导入所需模块
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score
import matplotlib.cm as cm
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
3 获取数据
这里仅获取了iris的特征变量,在本例中不使用iris的目标变量。另外,层次聚类主要适用于小样本,且无法根据训练好的模型对新样本分类。所以这里调用train_test_split()函数主要目的是缩小样本,用于演示谱系图(样本太多谱系图将太密集,无法看清细节)。
# Load the iris dataset; only the feature matrix is used (targets are ignored).
X = datasets.load_iris().data
# Shrink the sample with train_test_split: hierarchical clustering suits small
# samples, and a small training set keeps the dendrogram readable.
X_train, X_test = train_test_split(X, test_size=0.83, random_state=0)
4 模型训练
# Fit a bottom-up (agglomerative) hierarchical clustering model with 3 clusters.
HiCluster = AgglomerativeClustering(n_clusters=3)
HiCluster.fit(X_train)
# Cluster assignment for each training sample.
labels = HiCluster.labels_
print('聚类后的标签为:', '\n', labels)
聚类后的标签为:
[1 0 2 2 2 0 0 0 0 1 2 0 0 1 0 2 0 0 1 1 1 0 2 0 1]
注意:层次聚类无法根据训练好的模型对新样本分类!
5 模型评价
# Evaluate the clustering with three internal (label-free) metrics.
scores = {
    'silhouette': silhouette_score(X_train, labels, metric='euclidean'),
    'calinski': calinski_harabasz_score(X_train, labels),
    'davies': davies_bouldin_score(X_train, labels),
}
print('silhouette:', scores['silhouette'], '\n',
      'calinski:', scores['calinski'], '\n',
      'davies:', scores['davies'])
silhouette: 0.546594974770782
calinski: 80.83416383906932
davies: 0.5120802719618535
6 模型选择
一是根据谱系图来选择聚类数:
绘制谱系图时,需要重新估计不指定聚类数目的模型。
# 1.通过图形选择
# Refit without capping the number of clusters (distance_threshold=0 keeps
# merging to the top) so that model.distances_ is populated for the dendrogram.
HiCluster_none = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(X_train)
def plot_dendrogram(model, **kwargs):
    """Plot the dendrogram of a fitted AgglomerativeClustering model.

    The model must have been fitted with ``distance_threshold`` set so that
    ``model.distances_`` exists. Extra keyword arguments are forwarded to
    ``scipy.cluster.hierarchy.dendrogram`` (e.g. ``truncate_mode``,
    ``no_plot``).

    Returns the dict produced by ``dendrogram`` (backward-compatible addition;
    the original version returned None, which callers ignored).
    """
    # Count the number of original samples under each merge node, as required
    # by SciPy's linkage-matrix format.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node (an original sample)
            else:
                # Internal node: reuse the count already computed for it.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    # SciPy linkage row format: [child1, child2, merge distance, sample count].
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    # Draw the dendrogram.
    return dendrogram(linkage_matrix, **kwargs)
# Show the full, untruncated dendrogram of the training sample
# (truncate_mode defaults to None in scipy's dendrogram).
plot_dendrogram(HiCluster_none)
plt.show()
二是根据评价指标来选择聚类数:
# 2.通过分类结果来选择
# 2. Choose the number of clusters by comparing evaluation metrics.
range_n_clusters = [2, 3, 4, 5, 6]
sil_list, cal_list, dav_list = [], [], []
for n_clusters in range_n_clusters:
    # Refit for each candidate cluster count and score the result.
    cluster_labels = AgglomerativeClustering(n_clusters=n_clusters).fit(X_train).labels_
    sil_list.append(silhouette_score(X_train, cluster_labels))
    cal_list.append(calinski_harabasz_score(X_train, cluster_labels))
    dav_list.append(davies_bouldin_score(X_train, cluster_labels))
# One row per candidate cluster count (2..6); column order matches the
# original output: davies_avg, calinski_avg, silhouette_avg.
ks_score = pd.DataFrame(
    {
        'davies_avg': dav_list,
        'calinski_avg': cal_list,
        'silhouette_avg': sil_list,
    },
    index=range(2, 7),
)
print(ks_score)
7 二维图形展示
鸢尾花数据集(Iris)有4个特征变量,因而无法直接绘制出层次聚类在4维空间的聚类图形。这里仅取前2个特征变量,演示层次聚类在2维空间上的聚类图示。
# 2-D visualization: cluster on the first two iris features only.
HiCluster2 = AgglomerativeClustering(n_clusters=3).fit(X[:, :2])
labels_2d = HiCluster2.labels_
# Map the 3 cluster labels onto a color spectrum.
colors = cm.nipy_spectral(labels_2d.astype(float) / 3)
# Scatter the samples, colored by cluster membership.
plt.scatter(
    X[:, 0], X[:, 1],
    marker=".", s=100, lw=0, alpha=0.7,
    c=colors, edgecolor="k",
)
plt.show()