跳转至

WordCloud 词云生成

WordCloud 是一个用于生成词云图的 Python 库,可以将文本数据可视化为美观的词云图。

安装

pip install wordcloud

中文字体支持需要额外配置字体路径。

基本使用

简单词云

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 生成词云
text = "Python 数据分析 机器学习 深度学习 人工智能"
wordcloud = WordCloud(font_path='simhei.ttf').generate(text)

# 显示图像
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# 保存图像
wordcloud.to_file('wordcloud.png')

从文件读取

with open('text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

wordcloud = WordCloud(
    font_path='simhei.ttf',
    width=800,
    height=400
).generate(text)

自定义样式

基本参数

wordcloud = WordCloud(
    font_path='simhei.ttf',        # 字体路径
    width=800,                      # 宽度
    height=400,                     # 高度
    background_color='white',       # 背景色
    max_words=200,                  # 最大词数
    max_font_size=100,              # 最大字号
    min_font_size=10,               # 最小字号
    colormap='viridis',             # 配色方案
    relative_scaling=0.5,           # 词频重要性
    random_state=42                 # 随机种子
)

配色方案

# 使用 matplotlib 配色
from matplotlib import cm

wordcloud = WordCloud(
    font_path='simhei.ttf',
    colormap='rainbow'  # 'viridis', 'plasma', 'cool', 'hot'
)

高级功能

使用遮罩图片

from PIL import Image
import numpy as np

# 读取遮罩图片
mask = np.array(Image.open('mask.png'))

wordcloud = WordCloud(
    font_path='simhei.ttf',
    mask=mask,                    # 遮罩图片
    contour_width=3,              # 轮廓宽度
    contour_color='steelblue'     # 轮廓颜色
).generate(text)

自定义词频

# 使用字典设置词频
word_freq = {
    'Python': 100,
    '数据分析': 80,
    '机器学习': 60,
    '深度学习': 40
}

wordcloud = WordCloud(font_path='simhei.ttf').generate_from_frequencies(word_freq)

停用词过滤

# 设置停用词
stopwords = set(['的', '了', '在', '是', '我', '有'])

wordcloud = WordCloud(
    font_path='simhei.ttf',
    stopwords=stopwords
).generate(text)

# 从文件加载停用词
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

中文文本处理

使用 jieba 分词

import jieba
from wordcloud import WordCloud

# 读取文本
with open('text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# 分词
words = jieba.cut(text)
text_seg = ' '.join(words)

# 生成词云
wordcloud = WordCloud(
    font_path='simhei.ttf',
    background_color='white'
).generate(text_seg)

完整示例

import jieba
import jieba.analyse
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# 读取文本
with open('article.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# 加载停用词
with open('stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = set(f.read().splitlines())

# 分词
words = jieba.cut(text)
filtered_words = [w for w in words if w not in stopwords and len(w) > 1]
text_seg = ' '.join(filtered_words)

# 生成词云
wordcloud = WordCloud(
    font_path='simhei.ttf',
    width=1600,
    height=800,
    background_color='white',
    max_words=100,
    colormap='viridis'
).generate(text_seg)

# 显示
plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.savefig('wordcloud.png', dpi=300, bbox_inches='tight')
plt.show()

实用技巧

提取关键词

import jieba.analyse

# TF-IDF 提取关键词
keywords = jieba.analyse.extract_tags(text, topK=50, withWeight=True)
word_freq = {word: weight for word, weight in keywords}

wordcloud = WordCloud(
    font_path='simhei.ttf'
).generate_from_frequencies(word_freq)

多种形状

from PIL import Image
import numpy as np

# 圆形
def create_circle_mask(size=400):
    x, y = np.ogrid[:size, :size]
    mask = (x - size/2) ** 2 + (y - size/2) ** 2 > (size/2) ** 2
    mask = 255 * mask.astype(int)
    return mask

mask = create_circle_mask()
wordcloud = WordCloud(font_path='simhei.ttf', mask=mask).generate(text)

常见问题

中文显示方块

  • 原因: 未指定中文字体
  • 解决: 设置 font_path='simhei.ttf'

常用中文字体路径

  • Windows: C:\Windows\Fonts\simhei.ttf
  • Mac: /System/Library/Fonts/PingFang.ttc
  • Linux: /usr/share/fonts/truetype/

参考资源

  • GitHub: https://github.com/amueller/word_cloud
  • 文档: https://amueller.github.io/word_cloud/