具体实现的功能(如图所示),紧接着上一篇文档
在这里插入图片描述在这里插入图片描述

代码如下

// A highlighted block
import os,jieba,jieba.posseg
from typing import List

# Directory that holds the LCMC_<letter>.xml input files. The trailing
# separator is required: paths are built by plain string concatenation.
Input_path = 'E:\\模式识别大作业\\XML\\'
# Directory that receives the extracted .txt files, the merged Data_All.TXT
# and the Words_Dict.TXT word list. Same trailing-separator requirement.
Output_path = 'E:\\模式识别大作业\\txt\\'


def XML2txt2(load_XML, load_txt):
    """Extract the Chinese characters of one file into a plain-text file.

    Every line of ``load_XML`` is reduced to the characters in the range
    U+4E00..U+9FFF, so markup, Latin text, digits and punctuation are
    dropped. A line that keeps at least one character is written to
    ``load_txt`` followed by a newline; a line that keeps none produces no
    output at all.

    Args:
        load_XML: Path of the input file, read as UTF-8.
        load_txt: Path of the output file, created or overwritten as UTF-8.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    xml_filepath = os.path.abspath(load_XML)
    kept_lines = []
    # Context managers guarantee both handles are released even when
    # decoding or writing fails part-way through.
    with open(xml_filepath, mode='r', encoding='UTF-8') as f_xml:
        for line in f_xml:
            # The explicit range keeps ideographs only; a looser test such
            # as `ch > chr(127)` would let every non-ASCII character through.
            hanzi = [ch for ch in line if '\u4e00' <= ch <= '\u9fff']
            if hanzi:
                hanzi.append('\n')
                kept_lines.append(''.join(hanzi))
    with open(load_txt, 'w', encoding='utf-8') as f_txt:
        f_txt.writelines(kept_lines)

def XML2txt():
    """Convert every ``LCMC_<letter>`` file under ``Input_path`` to text.

    For each letter in ``'ABCDEFGHJKLMNPR'`` the converter ``XML2txt2`` is
    called twice: first for the lower-case ``.xml`` -> ``.txt`` pair, then
    for the upper-case ``.XML`` -> ``.TXT`` pair. Outputs are written under
    ``Output_path``.

    NOTE(review): on a case-insensitive file system both passes presumably
    hit the same files, so the second pass just redoes the first - confirm
    whether both are really needed.
    """
    suffix_pairs = (('.xml', '.txt'), ('.XML', '.TXT'))
    for letter in 'ABCDEFGHJKLMNPR':
        stem = 'LCMC_' + letter
        for xml_suffix, txt_suffix in suffix_pairs:
            XML2txt2(Input_path + stem + xml_suffix,
                     Output_path + stem + txt_suffix)
def HeBingWemBem(output_dir=None):
    """Concatenate the per-letter text files into ``Data_All.TXT``.

    The files ``LCMC_<letter>.txt`` for each letter in
    ``'ABCDEFGHJKLMNPR'`` are appended, in that order and line for line,
    to ``Data_All.TXT`` in the same directory.

    Args:
        output_dir: Directory containing the ``LCMC_*.txt`` files and
            receiving ``Data_All.TXT``. Defaults to the module-level
            ``Output_path``.

    Raises:
        OSError: If one of the per-letter files is missing or the merged
            file cannot be created.
    """
    if output_dir is None:
        output_dir = Output_path
    merged_path = os.path.join(output_dir, 'Data_All.TXT')
    # The merged file keeps the locale default encoding on purpose:
    # ShengChengCiKu reads it back without an explicit encoding.
    with open(merged_path, 'w') as fo:
        for letter in 'ABCDEFGHJKLMNPR':
            part_path = os.path.join(output_dir, 'LCMC_' + letter + '.txt')
            # The per-letter files are written as UTF-8 by XML2txt2, so
            # they must be decoded as UTF-8 regardless of the locale.
            with open(part_path, 'r', encoding='utf-8') as fi:
                # Copy the lines themselves; wrapping them in str() would
                # write the list's repr (brackets, quotes, escaped \n).
                fo.writelines(fi)
def ShengChengCiKu():
    """Build the word list ``Words_Dict.TXT`` from ``Data_All.TXT``.

    The merged corpus under ``Output_path`` is segmented with
    ``jieba.cut``; every distinct token longer than one character is
    collected, and the resulting set is written to ``Words_Dict.TXT`` as
    its ``str()`` representation.

    NOTE(review): ``str(set)`` has no stable element order between runs;
    the format is kept as-is in case something downstream parses it.

    Raises:
        OSError: If ``Data_All.TXT`` cannot be read or ``Words_Dict.TXT``
            cannot be written.
    """
    with open(Output_path + 'Data_All.TXT', 'r') as f:
        Lib = f.read()

    # jieba's parallel mode is POSIX-only and raises NotImplementedError
    # on Windows; fall back to single-process segmentation in that case.
    try:
        jieba.enable_parallel(10)
        parallel = True
    except NotImplementedError:
        parallel = False

    try:
        words = {token for token in jieba.cut(Lib) if len(token) > 1}
    finally:
        # Always shut the worker pool down, even if segmentation fails.
        if parallel:
            jieba.disable_parallel()

    # Open the output only once the result exists, so a failure above does
    # not leave a truncated word list behind.
    with open(Output_path + 'Words_Dict.TXT', 'w') as f_Dict:
        f_Dict.write(str(words))


if __name__ == '__main__':
    # Run the pipeline in order: extract text from the XML files, merge the
    # per-letter files, then build the word list.
    for stage in (XML2txt, HeBingWemBem, ShengChengCiKu):
        stage()
    print()

Logo

CSDN联合极客时间,共同打造面向开发者的精品内容学习社区,助力成长!

更多推荐