From dd8a293ead2411a71b2560a337ca50564169a15d Mon Sep 17 00:00:00 2001 From: Jimmy Xiang Date: Sat, 8 Feb 2020 11:05:04 +0800 Subject: [PATCH] Site updated: 2020-02-08 11:05:04 --- article/python-nlp-01/index.html | 40 +++----------------------------- 1 file changed, 3 insertions(+), 37 deletions(-) diff --git a/article/python-nlp-01/index.html b/article/python-nlp-01/index.html index d30537e..864332d 100644 --- a/article/python-nlp-01/index.html +++ b/article/python-nlp-01/index.html @@ -115,45 +115,11 @@

2. 准备《红楼梦》文本

已经分好词的文本可以通过下面的链接下载(下载后请重命名为 hlm_seg.txt,以便与后面词云代码读取的文件名一致):

https://github.com/flypythoncom/flypython/blob/master/wordcloud_hlm_seg.txt

或者也可以自己写代码,对原始文本进行清洗和分词。
这里需要先安装 jieba 分词库:pip install jieba

-
import jieba
-import re
+
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17

import jieba
import re

# Punctuation and symbols stripped from every line before segmentation.
# The hyphen is escaped so that it is matched literally: left unescaped,
# "^-—" is parsed as a character range (U+005E..U+2014), which together with
# re.IGNORECASE also deletes every ASCII letter from the text.
special_character_removal = re.compile(r'[,。、【 】“”:;()《》‘’{}?!⑦%>℃.^\-——=&#@¥『』]', re.IGNORECASE)

# Read the raw novel line by line and write one segmented line per input line,
# with the words separated by single spaces.
# Both files are managed by `with`, so the output is flushed and closed even if
# segmentation raises part-way through. The input is opened first so that a
# missing 'hlm.txt' does not truncate an existing 'hlm_seg.txt'.
with open('hlm.txt', encoding="utf-8") as fp, \
        open("hlm_seg.txt", "w", encoding="utf-8") as fw:
    for line in fp:
        cleaned = special_character_removal.sub('', line.strip())
        words = jieba.cut(cleaned)
        fw.write(" ".join(words))
        fw.write("\n")
-special_character_removal = re.compile(r'[,。、【 】“”:;()《》‘’{}?!⑦%>℃.^-——=&#@¥『』]', re.IGNORECASE) +

3. 编写词云python代码并运行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

from os import path
from wordcloud import WordCloud

d = path.dirname(__file__)

# Read the whole pre-segmented text. Using `with` closes the file handle as
# soon as the content is read instead of leaving it open.
with open(path.join(d, 'hlm_seg.txt'), encoding="utf-8") as text_file:
    text = text_file.read()

# Generate a word cloud image.
# Alternative font location (font file placed next to this script):
# font=path.join(d, "simkai.ttf")
font = 'C:/Windows/Fonts/simkai.ttf'
wordcloud = WordCloud(
    font_path=font,  # Chinese font; without it Chinese text is not displayed
    width=1024,  # image width
    height=840,  # image height
    background_color='white',  # background colour
    # max_words=100,  # maximum number of words
    # max_font_size=100,  # largest font size
).generate(text)

# Display the generated image the matplotlib way.
import matplotlib.pyplot as plt

plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
-fw=open("hlm_seg.txt","w",encoding="utf-8") - -with open('hlm.txt',encoding="utf-8") as fp: - for line in fp: - l = special_character_removal.sub('', line.strip()) - words=jieba.cut(l) - t=" ".join(words) - fw.write(t) - fw.write("\n") -fw.close()

3. 编写词云python代码并运行

from os import path  
-from wordcloud import WordCloud
-
-d = path.dirname(__file__)  
-# Read the whole text.  
-text = open(path.join(d, 'hlm_seg.txt'),encoding="utf-8").read()  
-# Generate a word cloud image  
-# font=path.join(d, "simkai.ttf")  
-font='C:/Windows/Fonts/simkai.ttf'  
-wordcloud = WordCloud(font_path=font,#设置中文字体,不指定就会出现中文不显示  
-  width=1024,#宽  
-  height=840,#高  
-  background_color='white',#设置背景色   
-  # max_words=100,#最大词汇数  
-  # max_font_size=100#最大号字体  
-  ).generate(text)  
-
-# Display the generated image:  
-# the matplotlib way:  
-import matplotlib.pyplot as plt  
-
-plt.figure()  
-plt.imshow(wordcloud)  
-plt.axis("off")  
-plt.show()

结果:

+

结果:

词云运行结果

后台回复“词云”获得完整运行代码

人生苦短,我用 Python 早下班。如果觉得这篇文章不错、对你的工作有帮助,请关注我的微信公众号 flypython,我们一起探讨 Python 相关问题。