```python
import os
import re
import glob
import itertools
import jieba
import pandas as pd
import networkx as nx
from collections import defaultdict
from gensim import corpora
from gensim.models import LdaModel
# Global configuration
DOCS_GLOB = r'D:\PyTorch_practice\博客分析\数据\原始博客\*.md'        # glob pattern for the input documents
STOPWORDS_PATH = r'D:\PyTorch_practice\博客分析\数据\设置\分词stop.txt'  # optional custom stopword file
OUTPUT_DIR = r'D:\PyTorch_practice\博客分析\数据\结果输出'              # directory for all output files
NO_BELOW = 5        # drop tokens that appear in fewer than this many documents
NO_ABOVE = 0.5      # drop tokens that appear in more than this fraction of documents
KEEP_N = 100000     # keep at most this many of the most frequent tokens
NUM_TOPICS = 10     # number of LDA topics
TOPN = 20           # number of keywords shown per topic
MAX_EDGES = 200     # maximum number of edges kept in the co-occurrence graph

# Default stopword set (used when no custom file is available)
DEFAULT_STOPWORDS = set([
'的', '了', '和', '是', '在', '就', '都', '而', '及', '与', '或', '一个', '我', '你', '他', '她', '它', '我们', '你们',
'他们', '她们', '这', '那', '其', '又', '被', '上', '中', '对', '所', '为', '于'
])

def load_stopwords(path):
    """Load the stopword list from a file, falling back to the built-in defaults."""
    sw = set()
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    w = line.strip()
                    if w:
                        sw.add(w)
        except Exception as e:
            print(f'Failed to read stopword file {path}: {e}')
    # Fall back to the default stopwords if the custom list is empty or missing
    return sw if sw else DEFAULT_STOPWORDS

def load_documents(glob_pattern):
    """Load the document collection matching the given glob pattern."""
    docs = []
    filenames = []
    # Collect the matching files
    file_list = sorted(glob.glob(glob_pattern))
    if not file_list:
        print(f'No files matched: {glob_pattern}')
        return filenames, docs
    print(f'Found {len(file_list)} files')
    # Read the content of each file
    for fp in file_list:
        try:
            with open(fp, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read().strip()
            if text:
                docs.append(text)
                filenames.append(os.path.basename(fp))
        except Exception as e:
            print(f'Failed to read {fp}: {e}')
    return filenames, docs

# Text cleaning and tokenisation:
# strip ASCII characters, digits and extra whitespace, keeping Chinese characters and Chinese punctuation
RE_CLEAN = re.compile(r"[\s\d\u0000-\u007F]+")

def preprocess(text, stopwords):
    """Preprocess one document: clean, tokenise with jieba, filter stopwords."""
    text = RE_CLEAN.sub(' ', text)
    tokens = jieba.lcut(text)
    # Drop stopwords and single-character tokens
    tokens = [t for t in tokens if t.strip() and t not in stopwords and len(t) > 1]
    return tokens

def build_cooccurrence(tokens_list):
    """Build term document frequencies and a term co-occurrence matrix."""
    cooc = defaultdict(int)
    freq = defaultdict(int)
    for tokens in tokens_list:
        # Deduplicate tokens within each document (preserving order)
        unique_tokens = list(dict.fromkeys(tokens))
        # Update document frequencies
        for w in unique_tokens:
            freq[w] += 1
        # Count co-occurrences
        for a, b in itertools.combinations(unique_tokens, 2):
            if a != b:
                # Order the pair so (a, b) and (b, a) are counted only once
                key = (a, b) if a < b else (b, a)
                cooc[key] += 1
    return freq, cooc
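# Note: because tokens are deduplicated per document above, freq[w] is the number of
# documents containing w (its document frequency), and cooc[(a, b)] is the number of
# documents in which a and b appear together. This is document-level co-occurrence,
# not a sliding-window measure, so edge weights are bounded by the number of documents.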

def main():
    """Run the full pipeline: preprocessing, LDA training and Gephi export."""
    # Create the output directory
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print('Loading stopwords...')
    stopwords = load_stopwords(STOPWORDS_PATH)
    print(f'Loaded {len(stopwords)} stopwords')
    print('Loading documents...')
    filenames, docs = load_documents(DOCS_GLOB)
    print(f'Loaded {len(docs)} documents')
    if len(docs) == 0:
        print('No documents found, check the DOCS_GLOB setting')
        return
    print('Tokenising and preprocessing... (this may take a while)')
    texts = [preprocess(d, stopwords) for d in docs]
    # Basic statistics after preprocessing
    total_tokens = sum(len(text) for text in texts)
    print(f'Preprocessing done, {total_tokens} tokens in total')
    # Build the dictionary and the corpus
    print('Building dictionary and corpus...')
    dictionary = corpora.Dictionary(texts)
    original_size = len(dictionary)
    dictionary.filter_extremes(no_below=NO_BELOW, no_above=NO_ABOVE, keep_n=KEEP_N)
    filtered_size = len(dictionary)
    print(f'Dictionary filtered: {original_size} -> {filtered_size} tokens')
    corpus = [dictionary.doc2bow(text) for text in texts]
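    # Each corpus entry is a bag-of-words vector: a list of (token_id, count) pairs.
    # Filtering the vocabulary with filter_extremes() first removes very rare and very
    # common tokens, which keeps the model small and tends to give cleaner topics.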
    # Train the LDA model
    print('Training LDA model...')
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        passes=15,              # number of training passes over the corpus
        random_state=42,        # fixed seed for reproducible topics
        per_word_topics=True    # also track the most likely topics per word
    )
    # Save the trained model
    model_path = os.path.join(OUTPUT_DIR, 'lda_model.model')
    lda.save(model_path)
    print(f'LDA model saved: {model_path}')
    # Print and export the topic-keyword distributions
    topics = lda.show_topics(num_topics=NUM_TOPICS, num_words=TOPN, formatted=False)
    print('\nTopic keywords (one topic per line):')
    rows_kw = []
    for tid, terms in topics:
        topic_words = ', '.join([f'{w}({p:.4f})' for w, p in terms])
        print(f'Topic {tid}: {topic_words}')
        for rank, (word, prob) in enumerate(terms, start=1):
            rows_kw.append({
                'topic': tid,
                'rank': rank,
                'word': word,
                'weight': float(prob)
            })
    # Save the topic keywords
    kw_path = os.path.join(OUTPUT_DIR, 'topic_keywords.csv')
    df_kw = pd.DataFrame(rows_kw)
    df_kw.to_csv(kw_path, index=False, encoding='utf-8-sig')
    print(f'Topic keywords saved: {kw_path}')
    # Export the document-topic distribution
    print('Exporting document-topic distribution...')
    rows = []
    for doc_id, bow in enumerate(corpus):
        doc_topics = lda.get_document_topics(bow, minimum_probability=0.0)
        for tid, weight in doc_topics:
            rows.append({
                'doc': filenames[doc_id],
                'doc_id': doc_id,
                'topic': int(tid),
                'weight': float(weight)
            })
    dt_path = os.path.join(OUTPUT_DIR, 'doc_topic.csv')
    df_dt = pd.DataFrame(rows)
    df_dt.to_csv(dt_path, index=False, encoding='utf-8-sig')
    print(f'Document-topic distribution saved: {dt_path}')
    # Build the topic-term bipartite graph
    print('Building topic-term bipartite graph...')
    G = nx.Graph()
    for tid, terms in topics:
        # Use the top three keywords as the topic label
        top_words = [w for w, _ in terms[:3]]
        topic_label = f"Topic_{tid}_" + "_".join(top_words)
        G.add_node(f'topic_{tid}', label=topic_label, type='topic', topic_id=tid)
    for tid, terms in topics:
        for term, prob in terms:
            if not G.has_node(term):
                G.add_node(term, label=term, type='term')
            weight = float(prob)
            G.add_edge(f'topic_{tid}', term, weight=weight)
    # Save the bipartite graph
    bipartite_path = os.path.join(OUTPUT_DIR, 'gephi_topic_term.gexf')
    nx.write_gexf(G, bipartite_path)
    print(f'Topic-term bipartite graph saved: {bipartite_path}')
    # Build the term co-occurrence graph
    print('Building term co-occurrence graph...')
    freq, cooc = build_cooccurrence(texts)
    # Sort edges by weight and keep only the MAX_EDGES strongest ones
    sorted_edges = sorted(cooc.items(), key=lambda x: x[1], reverse=True)
    # Collect the nodes that appear in the retained edges
    nodes_to_keep = set()
    edges_to_keep = []
    for (a, b), w in sorted_edges:
        if len(edges_to_keep) >= MAX_EDGES:
            break
        nodes_to_keep.add(a)
        nodes_to_keep.add(b)
        edges_to_keep.append(((a, b), w))
    # Build the graph from the retained nodes and edges only
    H = nx.Graph()
    # Add nodes (only those that appear in a retained edge)
    for node in nodes_to_keep:
        if node in freq:
            H.add_node(node, label=node, frequency=int(freq[node]), type='term')
    # Add edges
    for (a, b), w in edges_to_keep:
        if a in H and b in H:  # both endpoints must exist
            H.add_edge(a, b, weight=int(w))
    # Remove isolated nodes, if any
    H.remove_nodes_from(list(nx.isolates(H)))
    print(f'Co-occurrence graph has {H.number_of_nodes()} nodes and {H.number_of_edges()} edges')
    # Save the co-occurrence graph
    cooccurrence_path = os.path.join(OUTPUT_DIR, 'gephi_term_cooccurrence.gexf')
    nx.write_gexf(H, cooccurrence_path)
    print(f'Co-occurrence graph saved: {cooccurrence_path}')
    # Print a summary of the generated files
    print('\nAll done. Generated files:')
    output_files = [
        'lda_model.model',
        'topic_keywords.csv',
        'doc_topic.csv',
        'gephi_topic_term.gexf',
        'gephi_term_cooccurrence.gexf'
    ]
    for fn in output_files:
        file_path = os.path.join(OUTPUT_DIR, fn)
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
            print(f' - {file_path} ({file_size/1024:.1f} KB)')
        else:
            print(f' - {file_path} (not generated)')
    print('\nGephi visualisation tips:')
    print('1. Open Gephi: File → Open → select a .gexf file')
    print('2. Run a layout such as ForceAtlas2')
    print('3. Colour or filter nodes by the "type" node attribute')
    print('4. Size nodes by degree or edge weight')

if __name__ == '__main__':
    main()
```
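Once the run has finished, the saved model can be reused without retraining. Below is a minimal sketch, assuming the same OUTPUT_DIR as above and that gensim restores the `id2word` dictionary together with the model when `LdaModel.load` is called; the names `loaded` and `new_text` are illustrative only.

```python
import os
import jieba
from gensim.models import LdaModel

OUTPUT_DIR = r'D:\PyTorch_practice\博客分析\数据\结果输出'  # same output folder as in the script above

# Reload the trained model; gensim also restores the id2word dictionary it was trained with
loaded = LdaModel.load(os.path.join(OUTPUT_DIR, 'lda_model.model'))

# Score a new, unseen document against the learned topics
new_text = '这里放一篇新的博客正文'  # placeholder text, replace with real content
tokens = [t for t in jieba.lcut(new_text) if len(t) > 1]
bow = loaded.id2word.doc2bow(tokens)

# Print the three strongest topics in the new document's topic mixture
doc_topics = loaded.get_document_topics(bow, minimum_probability=0.0)
for topic_id, weight in sorted(doc_topics, key=lambda x: x[1], reverse=True)[:3]:
    print(topic_id, round(float(weight), 4))
```

For consistent results, the new text should go through the same `preprocess()` cleaning and stopword filtering as the training documents; the simplified tokenisation here is only for illustration.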