1 Problem Description
There are eight news articles stored as text files. Here we use a text-clustering approach (hierarchical clustering) to group the eight articles. The eight documents are placed under the Python working directory.
2 Import the Required Modules
import os
import jieba
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
3 Read the News Documents
documents = []
dirPath = './news'
fileList = os.listdir(dirPath)
for f in fileList:
    f = dirPath + '/' + f
    print(f)
    # The news files are GBK-encoded; read each one and collect its text
    with open(f, 'r', encoding='gbk') as fh:
        documents.append(fh.read())
The loop above prints the paths of the eight news documents.
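Note that os.listdir does not guarantee any particular file order, so which article counts as "news 1", "news 2", and so on depends on how the operating system lists the files. If a stable ordering is wanted, the names can be sorted before reading (an optional tweak; re-ordering would of course also shift the indices referred to in the results below):
fileList = sorted(os.listdir(dirPath))  # optional: sort file names for a reproducible document order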
4 Tokenize the Documents and Remove Stop Words
# Load the stop-word list (one word per line)
stopwords = [line.strip() for line in open('百度停词表.txt', 'r', encoding='utf-8').readlines()]
doc_words = []
for document in documents:
    # Tokenize with jieba, then drop stop words and leftover whitespace/punctuation
    doc = ""
    for word in jieba.lcut(document.strip()):
        if word not in stopwords and word not in ["—", "\n", " ", "　", "]", "["]:
            doc = doc + " " + word
    doc_words.append(doc)
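As a quick sanity check on the tokenization (a toy sentence, not from the corpus), jieba.lcut returns a plain list of tokens, which the loop above then filters and re-joins with spaces:
print(jieba.lcut("我爱北京天安门"))  # typically ['我', '爱', '北京', '天安门']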
5 Text Clustering on the Word-Frequency Matrix
The word-frequency (term count) matrix records how many times each term occurs in each document, and the clustering is performed directly on these counts.
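To make the idea concrete, here is a minimal sketch on three made-up English snippets (hypothetical documents, not from the news corpus); each row of the array is a document and each column a vocabulary term:
toy_docs = ["apple banana apple", "apple cherry", "apple cherry cherry"]
toy_cv = CountVectorizer()
toy_counts = toy_cv.fit_transform(toy_docs).toarray()
print(toy_cv.get_feature_names_out())  # ['apple' 'banana' 'cherry'] on recent scikit-learn (older versions use get_feature_names())
print(toy_counts)
# [[2 1 0]
#  [1 0 1]
#  [1 0 2]]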
count_vectorizer = CountVectorizer(min_df=3)  # drop terms that appear in fewer than 3 of the documents
cv = count_vectorizer.fit_transform(doc_words)
matrix = cv.toarray()
HiCluster = AgglomerativeClustering(n_clusters=3).fit(matrix)  # cluster into 3 groups; other cluster counts are worth trying too
labels = HiCluster.labels_
print('Cluster labels:', '\n', labels)
The clustering result is shown below: news 3 and news 4 each form their own cluster, while the remaining six articles are grouped together.
Cluster labels:  [0 0 2 1 0 0 0 0]
6 Text Clustering on the TF-IDF Matrix
The TF-IDF matrix does not store raw counts: each term's frequency within a document is weighted by its inverse document frequency, so terms that occur in many documents are down-weighted. Clustering on this matrix can therefore give a different result than clustering on the raw word-frequency matrix.
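A minimal sketch of the difference, reusing the made-up snippets from section 5 (not part of the news corpus): 'apple' occurs in every toy document, so TF-IDF gives it a smaller weight relative to the rarer 'banana' and 'cherry' than its raw counts alone would suggest.
toy_docs = ["apple banana apple", "apple cherry", "apple cherry cherry"]  # same toy documents as in section 5
toy_tfidf = TfidfVectorizer().fit_transform(toy_docs).toarray()
print(toy_tfidf.round(2))  # compare with the raw count matrix above: 'apple' is down-weighted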
tf_idf_vectorizer = TfidfVectorizer(min_df=0.3)  # keep only terms appearing in at least 30% of the documents (here, at least 3 of the 8)
tf_idf = tf_idf_vectorizer.fit_transform(doc_words)
matrix_tf_idf = tf_idf.toarray()
HiCluster = AgglomerativeClustering(n_clusters=3).fit(matrix_tf_idf)
labels = HiCluster.labels_
print('Cluster labels:', '\n', labels)
The clustering result is shown below: news 1, 3 and 5 form one cluster, news 2, 6 and 8 form another, and the remaining articles (news 4 and 7) form the third.
Cluster labels:  [0 1 0 2 0 1 2 1]
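To read the result more easily, each label can be printed next to the corresponding file name collected in section 3 (a small convenience sketch, assuming fileList is still in the order the documents were read):
for name, label in zip(fileList, labels):
    print(label, name)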
7 Plotting Dendrograms
7.1 Dendrogram for Clustering on the Word-Frequency Matrix
# Re-fit with distance_threshold=0 and n_clusters=None so that the full merge tree
# and the merge distances are available on the fitted model
HiCluster_none = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(matrix)

def plot_dendrogram(model, **kwargs):
    # Count the number of leaf samples under each node of the merge tree
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    # Assemble the linkage matrix expected by scipy's dendrogram
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)
    # Plot the dendrogram
    dendrogram(linkage_matrix, **kwargs)

plot_dendrogram(HiCluster_none, truncate_mode=None)
plt.show()
Note that the x-axis indices in the dendrogram start at 0, so index 0 corresponds to news 1, index 1 to news 2, and so on.
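If the 0-based indices are hard to read, scipy's dendrogram also accepts a labels argument, so the leaves can be labelled with the file names instead (a small sketch, assuming fileList from section 3 is still available):
plot_dendrogram(HiCluster_none, truncate_mode=None, labels=fileList)
plt.xticks(rotation=90)  # rotate the file names so they remain readable
plt.show()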
7.2 Dendrogram for Clustering on the TF-IDF Matrix
# plot_dendrogram was already defined in section 7.1, so it can be reused here
HiCluster_none = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(matrix_tf_idf)
plot_dendrogram(HiCluster_none, truncate_mode=None)
plt.show()
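For reference, a similar dendrogram can be produced without the helper function by calling scipy's own linkage routine on the feature matrix directly (a sketch; 'ward' is chosen here to match AgglomerativeClustering's default linkage):
from scipy.cluster.hierarchy import linkage

Z = linkage(matrix_tf_idf, method='ward')  # merge tree computed by scipy itself
dendrogram(Z, labels=fileList)             # label the leaves with the file names
plt.show()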