import datetime  # wall-clock timestamps for the run timer
import random  # random choice of the delay between pages
import re  # regular expressions (used to parse the total page count)
import time  # delays
import tkinter.filedialog
import webbrowser  # open web pages in the browser
from tkinter import *  # window widgets
from tkinter import messagebox  # pop-up dialogs
from tkinter import ttk

import openpyxl
import requests
from bs4 import BeautifulSoup
from lxml import etree
from openpyxl import Workbook

'''
版本:V3.0
语言环境:python 3.8
        pycharm 2020.2

'''

# Module-level state shared by the GUI callbacks below.
treedata1 = []  # meant for detailed company records; in this file it is only read by the crawl guard
treedata_jianhuaguanli = []  # one list per crawled company row; this is what gets saved to Excel
lerror2 = []  # page numbers recorded when a page fails validation
jishuleijia = 0  # running row counter: serial number, Treeview item id and progress numerator

_LICENSE_LIST_URL = "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!licenseInformation.action"
_LICENSE_REQUEST_TIMEOUT = 60  # seconds; a stalled connection must not freeze the GUI forever
_PAGE_DELAY_CHOICES = (6, 10, 5, 8, 7, 13, 9)  # seconds; random pause between pages to avoid an IP ban
_ROWS_PER_FULL_PAGE = 10  # the total-count estimate assumes every page but the last holds 10 rows
_BLOCKED_MESSAGE = "IP错误,请更换!!!!!"
_LICENSE_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;",
    "Accept-Encoding": "gzip",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Length": "141",
    "Content-Type": "application/x-www-form-urlencoded",
    "DNT": "1",
    "Host": "permit.mee.gov.cn",
    "Origin": "http://permit.mee.gov.cn",
    "Pragma": "no-cache",
    "Referer": "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!licenseInformation.action",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}


def _fetch_license_page(page_no):
    """POST the list query for ``page_no`` and return the response parsed by ``etree.HTML``."""
    payload = {
        "page.pageNo": page_no,
        "page.orderBy": "",
        "page.order": "",
        "province": "",
        "city": "",
        "registerentername": "",
        "xkznum": "",
        "treadname": "",
        "treadcode": "",
        "publishtime": "",
    }
    response = requests.post(
        _LICENSE_LIST_URL,
        headers=_LICENSE_REQUEST_HEADERS,
        data=payload,
        timeout=_LICENSE_REQUEST_TIMEOUT,
    )
    return etree.HTML(response.text)


def _parse_total_pages(onclick):
    """Return the page number embedded in a pager ``onclick`` value, or 0 if it has none.

    The value looks like ``javascript:jumpPage2(60)``: stripping every
    non-digit leaves ``260`` and dropping the first digit (the ``2`` of
    ``jumpPage2``) leaves the page number.
    """
    digits = re.sub(r"\D", "", onclick)[1:]
    return int(digits) if digits else 0


def _extract_license_columns(html):
    """Return the eight per-column value lists of one result page, in row-field order."""
    cell = '//table[@class="tabtd"]/tr/td'
    # For the positional columns the first match is dropped, exactly as the
    # original code did -- presumably that is the header row; confirm against
    # the page markup.
    return (
        html.xpath(cell + '[1]/text()')[1:],  # province / municipality
        html.xpath(cell + '[2]/text()')[1:],  # city
        html.xpath(cell + '[@class="font-green"]/text()'),  # permit number
        html.xpath(cell + '[@style="text-align: left;padding-left: 5px;"]/text()'),  # company name
        html.xpath(cell + '[5]/text()')[1:],  # industry category
        html.xpath(cell + '[6]/text()')[1:],  # validity period
        html.xpath(cell + '[7]/text()')[1:],  # registration time
        html.xpath(cell + '/a/@href'),  # link to the detail page
    )


def _wait_before_next_page(delay_seconds, started_at):
    """Sleep ``delay_seconds`` one second at a time, refreshing the ``l2`` label each second."""
    for elapsed in range(delay_seconds):
        time.sleep(1)
        l2["text"] = ('延迟时间:' + str(delay_seconds) + " / " + str(elapsed)
                      + " 开始时间:" + started_at.strftime('%H:%M'))
        l2.update()


def _report_failed_page(page_no):
    """Record ``page_no`` in ``lerror2`` and show it in the red ``lerror`` label."""
    print('发现错误页面:' + str(page_no))
    lerror2.append([page_no])
    lerror["text"] = '程序发现错误页面:' + str(page_no)
    lerror.update()
    print(lerror2)


def kaishipaqu_begin():
    """Crawl every page of the permit list into ``tree2`` and ``treedata_jianhuaguanli``.

    Button callback. It first reads the total page count from the pager on
    page 1, then fetches the last page to estimate the total number of
    companies, and finally walks all pages with a random delay between
    requests. Pages that cannot be fetched, come back empty, or whose columns
    have different lengths are recorded in ``lerror2`` and skipped. The
    elapsed time is shown in ``l2`` when the crawl ends.
    """
    global jishuleijia

    if treedata1 or treedata_jianhuaguanli:
        messagebox.showinfo("提示", '程序正在运行中,请稍候........')
        # The widgets' update() calls below let this callback be re-entered
        # while a crawl is running; without returning here a second crawl
        # would start and duplicate every row.
        return

    starttime = datetime.datetime.now()  # start of the run, for the timer labels

    # --- total number of pages -------------------------------------------
    try:
        first_page = _fetch_license_page(1)
    except requests.RequestException as exc:
        messagebox.showwarning("提示", '网络请求失败,请稍后重试:' + str(exc))
        return
    pager_targets = first_page.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')
    # The sixth pager link carries the last page number. When the site blocks
    # the client the pager is missing, which used to surface as an IndexError.
    zonyeshu = _parse_total_pages(pager_targets[5]) if len(pager_targets) > 5 else 0
    if not zonyeshu:
        messagebox.showwarning("提示", _BLOCKED_MESSAGE)
        return
    print(zonyeshu)

    # --- total number of companies ---------------------------------------
    try:
        last_page = _fetch_license_page(zonyeshu)
    except requests.RequestException as exc:
        messagebox.showwarning("提示", '网络请求失败,请稍后重试:' + str(exc))
        return
    zuihouyiye = len(last_page.xpath('//table[@class="tabtd"]/tr/td/a/@href'))
    qiyezongshu = (zonyeshu - 1) * _ROWS_PER_FULL_PAGE + zuihouyiye
    if not qiyezongshu:
        # Modal dialog instead of input(): it still pauses the crawl until the
        # user reacts, but does not depend on a console being attached.
        messagebox.showwarning("提示", _BLOCKED_MESSAGE)
    print(qiyezongshu)

    # --- crawl every page --------------------------------------------------
    for page_no in range(1, zonyeshu + 1):
        delay = random.choice(_PAGE_DELAY_CHOICES)
        print('延迟时间', delay)
        _wait_before_next_page(delay, starttime)

        try:
            html = _fetch_license_page(page_no)
        except requests.RequestException as exc:
            print(exc)
            _report_failed_page(page_no)
            continue

        columns = _extract_license_columns(html)
        if not columns[-1]:
            # No detail links at all: treated as an IP block. Pause until the
            # user dismisses the dialog, then move on to the next page.
            messagebox.showwarning("提示", _BLOCKED_MESSAGE)
            _report_failed_page(page_no)
            continue
        if len({len(column) for column in columns}) != 1:
            # Province and city are part of the check too: a short list used
            # to raise IndexError in the middle of a page.
            print("发现有数据不一致的地方")
            _report_failed_page(page_no)
            continue

        for fields in zip(*columns):
            row = [jishuleijia, *fields]
            # Index '0' inserts at the top, so the newest row is shown first.
            tree2.insert("", '0', jishuleijia, text="", values=tuple(row))
            treedata_jianhuaguanli.append(row)
            jishuleijia = jishuleijia + 1
            print(jishuleijia, "====", qiyezongshu)
            l["text"] = str(jishuleijia) + '/' + str(qiyezongshu)
            l.update()
            tree2.update()

    # Take the end time before the dialog so the time the user leaves it open
    # is not counted; total_seconds() also keeps runs longer than a day right.
    elapsed = datetime.datetime.now() - starttime
    minutes, second = divmod(int(elapsed.total_seconds()), 60)
    messagebox.showinfo("提示", "恭喜,所有数据都已准备完毕!请保存excel文件")
    print(elapsed)
    timeStr = str(minutes) + '分钟' + str(second) + "秒"
    print("程序从 " + starttime.strftime('%Y-%m-%d %H:%M') + ' 开始运行,运行时间为:' + timeStr)
    l2["text"] = '程序共运行时间:' + timeStr
    l2.update()


def tree_click(event):
    """Show the province/municipality of the selected ``tree2`` row in a dialog.

    Bound to ``<ButtonRelease>`` on ``tree2``; ``event`` is not used. When no
    row is selected (or the row has no values) a "no data yet" notice is shown
    instead.
    """
    selected = tree2.selection()
    # selection() returns a tuple of item ids. Handing the whole tuple to
    # item() only works when it holds exactly one id, so a multi-row selection
    # used to raise TclError; look at the first selected row instead.
    values = tree2.item(selected[0], 'values') if selected else ()
    if not values:
        messagebox.showinfo("提示", '现在还没有数据!')
        return
    # Index 1 is the second column of the row (province / municipality).
    messagebox.showinfo("提示", "你所选择的数据是:" + str(values[1]))



def jiayiguanli_save():
    """Write the crawled rows in ``treedata_jianhuaguanli`` to an .xlsx file chosen by the user.

    Button callback. Does nothing but show a notice when there is no data,
    and returns silently when the user cancels the file dialog. The workbook
    gets a header row followed by one row per crawled company. Any error
    while building or saving the workbook is reported in a dialog.
    """
    if not treedata_jianhuaguanli:
        messagebox.showinfo("提示", '没有数据,不必保存')
        return

    filename = tkinter.filedialog.asksaveasfilename(filetypes=[('xlsx', '*.xlsx')], initialdir='D:\\')
    if not filename:
        # Dialog cancelled: previously this still wrote a file named ".xlsx".
        return
    if not filename.lower().endswith('.xlsx'):
        # Only add the extension when the user did not type it already.
        filename = filename + '.xlsx'

    biaoti = ['序号', '省/直辖市', '地市', '许可证编号', '企业名称', '行业类别', '有效期限', '登记时间', '详细链接']
    try:
        wb = Workbook()
        wb1 = wb.create_sheet('index', 0)  # placed first in the workbook
        wb1.title = '管理数据'
        wb1.append(biaoti)
        for row in treedata_jianhuaguanli:
            wb1.append(row)
        wb.save(filename)
    except Exception:
        # GUI boundary: report the failure instead of letting the callback die silently.
        messagebox.showinfo("提示", '保存文件错误,请重试~!!')
    else:
        # Name the file that was actually written rather than a fixed one.
        messagebox.showinfo("提示", filename + "保存完毕~!!!")




# ---- main window ------------------------------------------------------------
root = Tk()
root.title("排污许可证数据信息")
root.geometry("900x750+500+50")  # "WIDTHxHEIGHT+X+Y": window size, then its position on screen

# ---- status strip: progress counter, timer text, last error page -----------
lbxianshixinxi = LabelFrame(root, width=800, text='', padx=80, pady=10)
lbxianshixinxi.grid(row=0, column=0)
l = Label(lbxianshixinxi, text='0/0', width=20)  # "rows done / rows expected"
l.grid(row=0, column=0, padx=10, pady=10, sticky=W)
l2 = Label(lbxianshixinxi, text='程序运行时间:', justify=RIGHT)  # delay countdown / total run time
l2.grid(row=0, column=1, padx=10, pady=10, sticky=W)
lerror = Label(lbxianshixinxi, text='', width=25, fg='red', justify=RIGHT)  # last failed page
lerror.grid(row=0, column=2, padx=10, pady=10, sticky=W)

# ---- frames for the data table and for the action buttons ------------------
lbtree = LabelFrame(root, width=500, height=10, text='数据显示区域', padx=8, pady=10)
lbtree.grid(row=1, column=0)
lb4 = LabelFrame(root, width=800, height=500, text='重点管理排污许可证数据', padx=8, pady=8, foreground='red')
lb4.grid(row=8, column=0)

# ---- action buttons: start crawl, save to Excel, quit ----------------------
button = Button(lb4, text=" 开始爬取(重点管理) ", command=kaishipaqu_begin)
button.grid(row=5, column=2, padx=20, pady=0, sticky=W)
button2 = Button(lb4, text=" 保存列表信息数据 ", command=jiayiguanli_save)
button2.grid(row=5, column=4, sticky=W)
button1 = Button(lb4, text=" 退   出 ", command=root.quit)
button1.grid(row=5, column=5, padx=20, pady=0, sticky=N)

# ---- data table with vertical and horizontal scrollbars --------------------
tree2 = ttk.Treeview(lbtree, height=20, show="headings")  # headings only: the tree column is hidden
scroll2_ty = Scrollbar(root, orient=VERTICAL, command=tree2.yview)
scroll2_ty.grid(row=1, column=2, sticky=N + S)
tree2['yscrollcommand'] = scroll2_ty.set
scroll2_tx = Scrollbar(root, orient=HORIZONTAL, command=tree2.xview)
scroll2_tx.grid(row=3, column=0, sticky=E + W)
tree2['xscrollcommand'] = scroll2_tx.set
tree2.grid(row=1, columnspan=1)

# Column ids double as heading texts; only the serial-number column is narrower.
tree2_columns = ('序号', '省/直辖市', '地市', '许可证编号', '企业名称', '行业类别', '有效期限', '登记时间', '详细链接')
tree2["columns"] = tree2_columns
for column_name in tree2_columns:
    tree2.column(column_name, width=50 if column_name == "序号" else 100)
    tree2.heading(column_name, text=column_name)

tree2.bind('<ButtonRelease>', tree_click)  # any mouse-button release on the table calls tree_click

root.mainloop()  # enter the Tk event loop; returns when the window quits

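# NOTE: the lines below are leftover web-page text captured together with the
# code (a logo caption, a promotional banner and a "more recommendations"
# heading). They are not part of the program and are kept only as comments so
# that the file parses.
# Logo
#
# 瓜分20万奖金 获得内推名额 丰厚实物奖励 易参与易上手
#
# 更多推荐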