1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
|
import os
import jieba
from wordcloud import WordCloud,ImageColorGenerator
import re
from collections import Counter
import imageio
# 设置停用词,哈工大版 from github & 手工加了些词
stopwords_file = '/Users/joy/Desktop/hit_stopwords.txt'
with open(stopwords_file, 'r', encoding='utf-8') as f:
stopwords = set(f.read().splitlines())
# 遍历指定文件夹下的所有 md 文件
md_folder = '/Users/joy/Documents/hugoblog/content/posts'
text = ''
for root, dirs, files in os.walk(md_folder):
for file in files:
if file.endswith('.md'):
filepath = os.path.join(root, file)
with open(filepath, 'r', encoding='utf-8') as f:
text += f.read()
# 使用 jieba 进行分词
words = jieba.lcut(text)
# 过滤停用词、单个字母数字组合以及长度小于 2 的词
words = [word for word in words if len(word) >= 2 and word.strip() and word not in stopwords and not re.match(r'^[a-zA-Z0-9]+$', word)]
# 计算词频
word_counter = Counter(words)
top_words = word_counter.most_common(100)
print("输入二个字及以上单词的词频:")
for word, count in top_words:
print(word, count)
# 生成遮罩
mask_image = '/Users/joy/Desktop/cat.png'
mask = imageio.imread(mask_image)
# 生成词云,mac 需要把字体定位到系统字体文件夹,不如中文会显示框框
wc1 = WordCloud(font_path="/System/Library/fonts/PingFang.ttc", width=800, height=400, background_color='white',mask=mask).generate_from_frequencies(word_counter)
# 保存词云图像
wc1.to_file('wc1.png')
# 生成遮罩
color_image = '/Users/joy/Desktop/test.png'
color=imageio.imread(color_image)
# 提取颜色
img_colors = ImageColorGenerator(color) #生成词云
wc2 = WordCloud(font_path="/System/Library/fonts/PingFang.ttc", width=800, height=400, background_color='white',mask=color).generate_from_frequencies(word_counter) #修改字体颜色
wc2.recolor(color_func=img_colors) #保存图像
wc2.to_file('wc2.png')
|