Text Analysis -- Topic Modeling with gensim
#!/usr/bin/python
# -*- coding:utf8 -*-
import time
import jieba.analyse

# Keyword extraction
def post_cut():
    fr = open("post_data.txt")  # source file
    fo = open("post_key.txt", "a+")  # keyword output
    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 3 and term[2] != "":
            # let jieba extract the top 30 keywords, ranked by TF-IDF weight
            key_list = jieba.analyse.extract_tags(term[2], 30)
            ustr = term[0] + "\t"
            for i in key_list:
                ustr += i.encode("utf-8") + " "
            fo.write(ustr + "\n")
    fr.close()
    fo.close()
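
As a quick sanity check, jieba.analyse.extract_tags(text, topK) returns the topK keywords of a piece of text ranked by TF-IDF weight. A minimal standalone sketch (the sample sentence is invented for illustration):

# Standalone check of jieba keyword extraction (sample sentence is made up):
import jieba.analyse
sample = "自然语言处理是人工智能领域的一个重要方向"
for keyword in jieba.analyse.extract_tags(sample, 5):
    print keyword.encode("utf-8")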
# TF-IDF weighting
def post_tfidf():
    from sklearn.feature_extraction.text import HashingVectorizer
    fr = open("post_key.txt")
    id_list = []
    data_list = []
    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 2:
            id_list.append(term[0])
            data_list.append(term[1])
    # HashingVectorizer implements the hashing trick; note that recent
    # scikit-learn versions replace non_negative=True with alternate_sign=False
    hv = HashingVectorizer(n_features=10000, non_negative=True)
    post_tfidf = hv.fit_transform(data_list)  # feature matrix of shape [n_samples, n_features]
    print 'Size of fea_train:' + repr(post_tfidf.shape)
    print post_tfidf.nnz  # number of stored non-zero entries
    post_cluster(id_list, post_tfidf)
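
The hashing trick maps every token to one of n_features fixed columns with a hash function, so no vocabulary has to be built or stored. A toy sketch of the idea (the documents are invented for illustration; norm=None keeps raw counts visible, and depending on the scikit-learn version some entries may carry a sign flip):

# Toy demonstration of the hashing trick (sample docs are hypothetical):
from sklearn.feature_extraction.text import HashingVectorizer
docs = ["apple banana apple", "banana cherry"]
hv = HashingVectorizer(n_features=8, norm=None)
X = hv.fit_transform(docs)
print X.toarray()  # each token is hashed into one of 8 fixed columns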
# Clustering
def post_cluster(id_list, tfidf_vec):
    from sklearn.cluster import KMeans
    kmean = KMeans(n_clusters=300)  # cluster into 300 groups
    print "kmeans"
    kmean.fit(tfidf_vec)
    pred = kmean.predict(tfidf_vec)
    count = 0
    fo = open("cluster.txt", "a+")  # write the clustering result
    for i in range(len(pred)):
        count += 1
        fo.write(id_list[i] + "\t" + str(pred[i]) + "\n")
    fo.close()
    print "%d documents clustered" % count
# LDA clustering
def post_lda(cluster):
    from gensim import corpora, models, matutils
    count = 0
    fr = open("post_key.txt")
    fo2 = open("post_vec_lda.txt", "a+")  # LDA vector output
    id_list = []
    data_list = []
    for line in fr.readlines():
        term = line.strip().split("\t")
        if len(term) == 2:
            count += 1
            id_list.append(term[0])
            word = term[1].strip().split()
            data_list.append(word)
    print "lda: building dictionary"
    dic = corpora.Dictionary(data_list)  # build the dictionary
    corpus = [dic.doc2bow(text) for text in data_list]  # sparse bag-of-words vector for each text
    print "lda: computing tf-idf"
    tfidf = models.TfidfModel(corpus)  # fit tf-idf statistics
    corpus_tfidf = tfidf[corpus]  # sparse tf-idf vector for each text
    print "lda: training model"
    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=200)
    # sparse LDA vector per text; each entry is (topic id, membership weight)
    corpus_lda = lda[corpus_tfidf]
    num = 0
    for doc in corpus_lda:
        wstr = ""
        for i in range(len(doc)):
            item = doc[i]
            wstr += str(item[0]) + "," + str(item[1])[0:7] + "/"
        fo2.write(id_list[num] + "\t" + wstr[0:-1] + "\n")
        num += 1
    fr.close()
    fo2.close()
    print num
    if cluster:
        # convert the gensim sparse corpus to a scipy sparse matrix, documents as rows
        lda_csc_matrix = matutils.corpus2csc(corpus_lda).transpose()
        post_cluster(id_list, lda_csc_matrix)
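
To eyeball what the learned topics mean, gensim can list the highest-weighted words per topic. A self-contained toy sketch (the three documents are invented; with a recent gensim, show_topics with formatted=True returns (topic id, word string) pairs):

# Toy example of inspecting LDA topics (documents are hypothetical):
from gensim import corpora, models
texts = [["apple", "banana", "fruit"],
         ["linux", "kernel", "system"],
         ["fruit", "banana", "smoothie"]]
dic = corpora.Dictionary(texts)
bow = [dic.doc2bow(t) for t in texts]
lda = models.LdaModel(bow, id2word=dic, num_topics=2)
for topic_id, words in lda.show_topics(num_topics=2, num_words=4, formatted=True):
    print topic_id, words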
if __name__ == "__main__":
    start = time.time()  # don't shadow the time module, or the final call below fails
    post_cut()
    post_tfidf()
    lda_cluster = False
    post_lda(lda_cluster)
    print time.time() - start