1 Problem description
We have eight news articles stored as plain-text files. Using a text-analysis approach, we segment the eight articles into words, compute the word adjacency (co-occurrence) matrix, and draw a network graph of the most frequent words. The eight documents are stored under the Python working directory.
2 Import the required modules
#!/usr/bin/env python
# -*- coding: cp936 -*-   # the source file contains Chinese text, declared as GBK (cp936)
import pandas as pd
import jieba
import jieba.posseg as jp
from sklearn.feature_extraction.text import CountVectorizer
import os
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
3 Read the news text files
documents = []
dirPath = './news'
fileList = os.listdir(dirPath)
for f in fileList:
    f = dirPath + '/' + f
    print(f)
    # the news files are GBK-encoded, so read them with encoding='gbk'
    with open(f, 'r', encoding='gbk') as fh:
        documents.append(fh.read())
The print call lists the paths of the eight news documents.
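If the directory might contain files other than the eight news texts, or if a stable document order matters, a more defensive variant of the loop above filters and sorts the listing (this assumes the news files carry a .txt extension, which the original text does not state):

documents = []
for name in sorted(os.listdir(dirPath)):          # sort for a repeatable document order
    if not name.endswith('.txt'):                 # assumption: news files end in .txt
        continue
    path = os.path.join(dirPath, name)
    print(path)
    with open(path, 'r', encoding='gbk') as fh:
        documents.append(fh.read())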
4 Segment the documents, remove stop words and non-nouns
# stop-word list, one word per line, UTF-8 encoded
with open('百度停词表.txt', 'r', encoding='utf-8') as fh:
    stopwords = [line.strip() for line in fh]
flags = ('n', 'nr', 'ns', 'nt')  # noun part-of-speech tags to keep
doc_words = []
for document in documents:
    # segment with jieba, keep only nouns, drop stop words and stray symbols
    doc = ""
    for word in jp.cut(document.strip()):
        if word.flag in flags and word.word not in stopwords and word.word not in ["—", "\n", " ", " ", "]", "["]:
            doc = doc + " " + word.word
    doc_words.append(doc)
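For reference, jieba.posseg yields pair objects whose .word attribute holds the token text and whose .flag attribute holds the part-of-speech tag, which is what the filter above relies on. A quick check on a short made-up sentence:

import jieba.posseg as jp
# print every token with its POS tag; the noun tags ('n', 'nr', 'ns', 'nt') are what we keep above
for pair in jp.cut("记者在北京报道了这条新闻"):
    print(pair.word, pair.flag)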
5 Build the word-frequency matrix
count_vectorizer = CountVectorizer(min_df=3)   # keep terms appearing in at least 3 documents
cv = count_vectorizer.fit_transform(doc_words)
feature_names = count_vectorizer.get_feature_names_out()  # use get_feature_names() on scikit-learn < 1.0
matrix = cv.toarray()
df_cv = pd.DataFrame(matrix, columns=feature_names)
# keep only the 15 most frequent words
num = list(np.argsort(-df_cv.apply(lambda x: x.sum())))
df_cv = df_cv.iloc[:, num[0:15]]
print(df_cv)
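Note that an integer min_df makes CountVectorizer keep only terms that occur in at least that many documents, which is why min_df=3 already prunes rare words before the top-15 selection. A toy sketch with a made-up three-document corpus:

from sklearn.feature_extraction.text import CountVectorizer
toy_docs = ["北京 新闻 记者", "北京 记者 报道", "北京 新闻 发布"]
toy_cv = CountVectorizer(min_df=2).fit(toy_docs)
# only words present in at least 2 of the 3 toy documents survive: 北京, 新闻, 记者
print(toy_cv.get_feature_names_out())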
6 Compute the word adjacency matrix
mt = np.array(df_cv)
names = df_cv.columns.values
# mt.T @ mt yields the word-word co-occurrence counts
df_mt = pd.DataFrame(np.dot(mt.T, mt), columns=names, index=names)
print(df_mt)
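The product np.dot(mt.T, mt) turns the document-by-word count matrix into a word-by-word matrix: entry (i, j) sums count(word_i) * count(word_j) over all documents, so off-diagonal values measure how strongly two words co-occur across the news texts, while the diagonal holds each word's summed squared counts. A tiny worked example with a made-up 2-document, 2-word matrix:

import numpy as np
X = np.array([[2, 1],    # doc 1: word A twice, word B once
              [0, 3]])   # doc 2: word A absent, word B three times
print(np.dot(X.T, X))
# [[ 4  2]
#  [ 2 10]]  -> A-B weight = 2*1 + 0*3 = 2; diagonal = 2**2 + 0**2 and 1**2 + 3**2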
7 Draw the network graph
plt.rcParams['font.sans-serif'] = ['SimHei']   # use the SimHei font so Chinese labels display
plt.rcParams['axes.unicode_minus'] = False     # render the minus sign correctly
netf = nx.from_pandas_adjacency(df_mt)
nx.draw(netf, with_labels=True, node_color='yellow')
plt.show()
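Because the diagonal of df_mt is nonzero, from_pandas_adjacency attaches a self-loop to every node. An optional refinement, not part of the original code, is to zero the diagonal first and scale edge widths by the co-occurrence weights:

adj = df_mt.values.copy()
np.fill_diagonal(adj, 0)                             # drop self-loops
g = nx.from_pandas_adjacency(pd.DataFrame(adj, index=names, columns=names))
weights = [g[u][v]['weight'] for u, v in g.edges()]
pos = nx.spring_layout(g, seed=1)                    # fixed seed for a repeatable layout
nx.draw(g, pos, with_labels=True, node_color='yellow',
        width=[0.5 + 2.5 * w / max(weights) for w in weights])
plt.show()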