1. Download the Wikipedia data

https://dumps.wikimedia.org/zhwiki/20190820/
zhwiki-20190820-pages-articles.xml.bz2
You can also download one of the smaller dumps listed on the same page.
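If you want to script the download instead of using a browser, a one-liner in the style of the commands used later in this guide works (a sketch; it assumes wget is available on your system):

wget https://dumps.wikimedia.org/zhwiki/20190820/zhwiki-20190820-pages-articles.xml.bz2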

2. Extract the text from the bz2 dump

Make sure gensim is installed: pip install gensim

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# process.py: convert the XML wiki dump to plain text

import logging
import os.path
import sys

from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])  # name of this script
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    if len(sys.argv) < 3:
        print("Usage: python process.py <input.xml.bz2> <output.txt>")
        sys.exit(1)

    inp, outp = sys.argv[1:3]
    space = " "
    i = 0

    output = open(outp, 'w', encoding='utf-8')
    # WikiCorpus is gensim's Wikipedia-dump reader; passing an empty
    # dictionary skips the vocabulary-building pass we do not need here
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    # get_texts() yields each article as one list of tokens,
    # with markup and punctuation already stripped
    for text in wiki.get_texts():
        output.write(space.join(text) + "\n")
        i = i + 1
        if i % 10000 == 0:
            logger.info("Saved " + str(i) + " articles.")

    output.close()
    logger.info("Finished saving " + str(i) + " articles.")
    # usage: python process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.txt

Rename the dump if needed so the file names match, then run from that directory:
python process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.txt
This produces wiki.zh.txt.
You can check it with a quick test script (the file is large, so opening it directly is slow):

# test.py: print the first line of the extracted corpus
import codecs

f = codecs.open('wiki.zh.txt', 'r', encoding="utf-8")
line = f.readline()
print(line)
# usage: python test.py

3. Convert Traditional to Simplified Chinese

The extracted text is in Traditional Chinese by default. To convert it to Simplified, get a copy of OpenCC 1.0.1, for example from:
Link: https://pan.baidu.com/s/1e_qPxWzAZS74s4aIEEQWSA
Extraction code: by1b
After unpacking, put wiki.zh.txt into the OpenCC directory and change into it:
cd /d X:\opencc-1.0.1-win64
Then run:
opencc -i wiki.zh.txt -o wiki.zh.simp.txt -c t2s.json
This produces the Simplified Chinese file wiki.zh.simp.txt, which you can again check with test.py.
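If you would rather stay in Python than use the opencc command-line tool, a pip package exposes the same t2s conversion. A minimal sketch, assuming the opencc-python-reimplemented package (pip install opencc-python-reimplemented):

# t2s.py: pure-Python alternative to the opencc CLI step above
from opencc import OpenCC

cc = OpenCC('t2s')  # t2s = Traditional Chinese -> Simplified Chinese
with open('wiki.zh.txt', 'r', encoding='utf-8') as fin, \
     open('wiki.zh.simp.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(cc.convert(line))  # convert each line and write it out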

4. Word segmentation

Use the jieba tokenizer: pip install jieba
Then, in the directory containing wiki.zh.simp.txt, run the script below.

# seg.py: segment the corpus with jieba
# (do not name this file jieba.py, or "import jieba" would import the script itself)
import jieba
import codecs

f = codecs.open('wiki.zh.simp.txt', 'r', encoding='utf-8')
target = codecs.open('wiki.zh.simp.seg.txt', 'w', encoding='utf-8')
print('open files')
line_num = 1
line = f.readline()
while line:
    print('----processing', line_num, 'article--------------------')
    line_seg = " ".join(jieba.cut(line))  # join the tokens with spaces
    target.write(line_seg)
    line_num = line_num + 1
    line = f.readline()
f.close()
target.close()
# usage: python seg.py

This produces the segmented file wiki.zh.simp.seg.txt.
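If jieba splits apart domain terms you care about, you can optionally load a custom dictionary before the segmentation loop. A minimal sketch; userdict.txt is a hypothetical file with one entry per line in jieba's "word frequency POS" format:

import jieba

jieba.load_userdict('userdict.txt')  # hypothetical custom dictionary file
# terms listed in userdict.txt now come out as single tokens
print(' '.join(jieba.cut('自然语言处理很有趣')))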

5. Generate the model

# word2vec_model.py: train Word2Vec on the segmented corpus
import logging
import multiprocessing
import os.path
import sys

from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 4:
        print("Usage: python word2vec_model.py <input> <model_out> <vectors_out>")
        sys.exit(1)
    input_dir, outp1, outp2 = sys.argv[1:4]

    # PathLineSentences also accepts a single file, not just a directory
    model = Word2Vec(PathLineSentences(input_dir),
                     size=256, window=10, min_count=5,
                     workers=multiprocessing.cpu_count(), iter=10)
    model.save(outp1)                                   # full model (training can be resumed)
    model.wv.save_word2vec_format(outp2, binary=False)  # plain-text word vectors
# usage: python word2vec_model.py wiki.zh.simp.seg.txt wiki.zh.text.model wiki.zh.text.vector
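Note that the parameter names above follow gensim 3.x, which this walkthrough was written against. Under gensim 4.0 or newer, size and iter were renamed; only the constructor call changes:

# gensim >= 4.0 spelling of the same training call
model = Word2Vec(PathLineSentences(input_dir),
                 vector_size=256, window=10, min_count=5,
                 workers=multiprocessing.cpu_count(), epochs=10)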

Four files are produced: wiki.zh.text.model (gensim stores the large weight matrices in two companion .npy files next to it) and the plain-text wiki.zh.text.vector.

6. Test the model

# test_model.py: query the trained model for nearest neighbours
from gensim.models import Word2Vec

model = Word2Vec.load('wiki.zh.text.model')

testwords = ['鼠标', '编程', '杯子', '实验室', '牛奶']

for word in testwords:
    # query through model.wv; calling most_similar on the model directly is deprecated
    res = model.wv.most_similar(word)
    print(word)
    print(res)

For each test word you get its nearest neighbours along with their similarity scores.
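The same loaded model supports a few other handy queries; a quick sketch (it assumes these particular words made it past the min_count=5 cutoff into the vocabulary):

# more_queries.py: extra checks on the trained model
from gensim.models import Word2Vec

model = Word2Vec.load('wiki.zh.text.model')
# cosine similarity between two words
print(model.wv.similarity('杯子', '牛奶'))
# analogy query: 国王 - 男人 + 女人 ≈ ?
print(model.wv.most_similar(positive=['国王', '女人'], negative=['男人'], topn=3))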
