全国排污许可证数据
from tkinter import *# 导入窗口控件import tkinter.filedialogimport requestsfrom lxml import etreefrom tkinter import ttkfrom bs4 import BeautifulSoupimport webbrowser# 调用浏览器打开网页from tkinter import messagebo
·
import datetime  # timestamps and run-time measurement
import random  # random delay between requests
import re  # used by the scraper; do not rely on the tkinter star import to provide it
import time  # sleeping between requests
import tkinter.filedialog
import webbrowser  # open a page in the default browser
from tkinter import *  # window widgets
from tkinter import messagebox  # pop-up dialogs
from tkinter import ttk

import openpyxl
import requests
from bs4 import BeautifulSoup
from lxml import etree
from openpyxl import Workbook
'''
版本:V3.0
语言环境:python 3.8
pycharm 2020.2
'''
# Module-wide state shared by the GUI callbacks.

# Checked by the scraper's "already running" guard; nothing visible in this
# file ever appends to it.
treedata1 = []

# One list per scraped permit row, in table-column order; this is what gets
# written to the Excel file.
treedata_jianhuaguanli = []

# Page numbers that could not be processed, each stored as a one-element list.
lerror2 = []

# Running count of rows scraped so far; also used as the Treeview item id and
# as the numerator of the progress label.
jishuleijia = 0
# 得到总页数_开始
# Public permit list endpoint: POST target for every query, also sent as Referer.
_PERMIT_LIST_URL = "http://permit.mee.gov.cn/perxxgkinfo/syssb/xkgg/xkgg!licenseInformation.action"

# Browser-like request headers.  Content-Length is intentionally not listed:
# requests derives it from the encoded form body, so a fixed value is at best
# ignored and at worst wrong.
_PERMIT_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;",
    "Accept-Encoding": "gzip",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded",
    "DNT": "1",
    "Host": "permit.mee.gov.cn",
    "Origin": "http://permit.mee.gov.cn",
    "Pragma": "no-cache",
    "Referer": _PERMIT_LIST_URL,
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request freezes the GUI forever
_ROWS_PER_PAGE = 10  # rows on every page except possibly the last
_DELAY_CHOICES = (6, 10, 5, 8, 7, 13, 9)  # seconds to wait before each page, to avoid an IP ban
_IP_BLOCKED_MESSAGE = "IP错误,请更换!!!!!"


def _fetch_permit_page(page_no):
    """POST the unfiltered list query for ``page_no`` and return the parsed HTML tree.

    Raises requests.RequestException on network failure or timeout.
    """
    form = {
        "page.pageNo": page_no,
        "page.orderBy": "",
        "page.order": "",
        "province": "",
        "city": "",
        "registerentername": "",
        "xkznum": "",
        "treadname": "",
        "treadcode": "",
        "publishtime": "",
    }
    response = requests.post(_PERMIT_LIST_URL, headers=_PERMIT_HEADERS, data=form,
                             timeout=_REQUEST_TIMEOUT)
    return etree.HTML(response.text)


def _parse_total_pages(tree):
    """Return the total page count read from the pager, or None when it is missing."""
    if tree is None:
        return None
    handlers = tree.xpath('//div[@class="fr margin-t-33 margin-b-20"]/a/@onclick')
    if len(handlers) < 6:
        return None
    # The sixth pager link carries e.g. "javascript:jumpPage2(60)"; the number
    # inside the parentheses is the last page.  Reading it from the call
    # argument avoids picking up the digit in the function name.
    match = re.search(r"\(\D*(\d+)", handlers[5])
    return int(match.group(1)) if match else None


def _extract_permit_columns(tree):
    """Return the eight per-row value lists of one result page, in table-column order."""
    def cell_text(index):
        # The first match is skipped, exactly as the original scraper did
        # (presumably the header row -- TODO confirm against the live page).
        return tree.xpath('//table[@class="tabtd"]/tr/td[%d]/text()' % index)[1:]

    return (
        cell_text(1),  # province / municipality
        cell_text(2),  # city
        tree.xpath('//table[@class="tabtd"]/tr/td[@class="font-green"]/text()'),  # permit number
        tree.xpath('//table[@class="tabtd"]/tr/td[@style="text-align: left;padding-left: 5px;"]/text()'),  # enterprise name
        cell_text(5),  # industry category
        cell_text(6),  # validity period
        cell_text(7),  # approval date
        tree.xpath('//table[@class="tabtd"]/tr/td/a/@href'),  # detail-page link
    )


def _record_failed_page(page_no):
    """Remember ``page_no`` in ``lerror2`` and show it in the red error label."""
    lerror2.append([page_no])
    lerror["text"] = '程序发现错误页面:' + str(page_no)
    lerror.update()
    print(lerror2)


def kaishipaqu_begin():
    """Scrape every page of the public permit list into the table and the export buffer.

    Callback of the "start" button.  The function

    1. requests page 1 and reads the total page count from the pager,
    2. requests the last page and counts its rows to compute the total number
       of permits, ``(pages - 1) * 10 + rows_on_last_page``,
    3. walks pages 1..N, waiting a random number of seconds before each
       request, and appends every row to ``tree2`` and to
       ``treedata_jianhuaguanli`` while updating the progress label ``l`` and
       the status label ``l2``.

    Pages that cannot be fetched or whose columns have inconsistent lengths
    are recorded in ``lerror2``, shown in ``lerror`` and skipped.  If data has
    already been collected the function only shows a notice and returns.
    """
    global jishuleijia

    if treedata1 or treedata_jianhuaguanli:
        messagebox.showinfo("提示", '程序正在运行中,请稍候........')
        return  # do not start a second scrape on top of existing data

    starttime = datetime.datetime.now()

    try:
        zonyeshu = _parse_total_pages(_fetch_permit_page(1))
        last_page = _fetch_permit_page(zonyeshu) if zonyeshu else None
    except requests.RequestException as exc:
        messagebox.showerror("提示", "网络请求失败,请检查网络后重试:" + str(exc))
        return

    zuihouyiye = 0
    if last_page is not None:
        zuihouyiye = len(last_page.xpath('//table[@class="tabtd"]/tr/td/a/@href'))
    qiyezongshu = (zonyeshu - 1) * _ROWS_PER_PAGE + zuihouyiye if zonyeshu else 0
    if not qiyezongshu:
        # No pager / no rows: the site most likely blocked this IP.  A dialog is
        # used instead of input(), which would block on a console the GUI may
        # not have.
        messagebox.showerror("提示", _IP_BLOCKED_MESSAGE)
        return
    print(zonyeshu)
    print(qiyezongshu)

    for page_no in range(1, zonyeshu + 1):
        # Random, visibly counted-down pause before each request.
        yanshi = random.choice(_DELAY_CHOICES)
        print('延迟时间', yanshi)
        for waited in range(yanshi):
            time.sleep(1)
            l2["text"] = ('延迟时间:' + str(yanshi) + " / " + str(waited)
                          + " 开始时间:" + starttime.strftime('%H:%M'))
            l2.update()

        try:
            html = _fetch_permit_page(page_no)
        except requests.RequestException as exc:
            print('发现错误页面:' + str(page_no), exc)
            _record_failed_page(page_no)
            continue

        # Defensive: an unparsable response is treated like a blocked one.
        columns = _extract_permit_columns(html) if html is not None else None
        if not columns or not columns[-1]:
            # The modal dialog pauses the run until the user has dealt with the
            # block; the page is recorded so it is not lost silently.
            messagebox.showwarning("提示", _IP_BLOCKED_MESSAGE)
            _record_failed_page(page_no)
            continue

        # Every column (including province and city) must have one value per
        # row, otherwise rows would be misaligned or indexing would fail.
        if len({len(column) for column in columns}) != 1:
            print("发现有数据不一致的地方")
            print('发现错误页面:' + str(page_no))
            _record_failed_page(page_no)
            continue

        for row in zip(*columns):
            values = (jishuleijia, *row)
            # Index '0' puts the newest row at the top; the counter doubles as item id.
            tree2.insert("", '0', jishuleijia, text="", values=values)
            treedata_jianhuaguanli.append(list(values))
            jishuleijia = jishuleijia + 1
            print(jishuleijia, "====", qiyezongshu)
            l["text"] = '' + str(jishuleijia) + '/' + str(qiyezongshu)
            l.update()
            tree2.update()

    # Take the end time before the modal dialog so that the time the user
    # spends reading it is not counted as run time.
    endtime = datetime.datetime.now()
    messagebox.showinfo("提示", "恭喜,所有数据都已准备完毕!请保存excel文件")

    elapsed = endtime - starttime
    # total_seconds() rather than .seconds: the latter drops whole days.
    minutes, second = divmod(int(elapsed.total_seconds()), 60)
    start = starttime.strftime('%Y-%m-%d %H:%M')
    print(elapsed)
    timeStr = str(minutes) + '分钟' + str(second) + "秒"
    print("程序从 " + start + ' 开始运行,运行时间为:' + timeStr)
    l2["text"] = '程序共运行时间:' + timeStr
    l2.update()
def tree_click(event):
    """Show the province/municipality value of the selected row in a dialog.

    Bound to ``<ButtonRelease>`` on ``tree2``.  ``event`` is the Tk event
    object; it is not used.  When nothing is selected (or the selected row has
    no values) a "no data yet" notice is shown instead.
    """
    selected = tree2.selection()
    # selection() returns a tuple of item ids; item() expects a single id, so
    # only the first selected row is inspected (several rows may be selected).
    values = tree2.item(selected[0], 'values') if selected else ()
    if not values:
        messagebox.showinfo("提示", '现在还没有数据!')
        return
    # Index 1 is the second column of the table, '省/直辖市'.
    messagebox.showinfo("提示", "你所选择的数据是:" + str(values[1]))
def jiayiguanli_save():
    """Write the scraped rows in ``treedata_jianhuaguanli`` to an .xlsx file.

    Callback of the "save" button.  Asks the user for a target file, creates a
    workbook whose first sheet ('管理数据') holds a header row followed by one
    row per scraped permit, and reports success or failure in a dialog.
    Nothing is written when there is no data or the dialog is cancelled.
    """
    if not treedata_jianhuaguanli:
        messagebox.showinfo("提示", '没有数据,不必保存')
        return

    filename = tkinter.filedialog.asksaveasfilename(filetypes=[('xlsx', '*.xlsx')], initialdir='D:\\')
    if not filename:
        # Dialog cancelled: previously this still saved a file named ".xlsx".
        return
    if not filename.lower().endswith('.xlsx'):
        filename = filename + '.xlsx'

    biaoti = ['序号', '省/直辖市', '地市', '许可证编号', '企业名称', '行业类别', '有效期限', '登记时间', '详细链接']
    try:
        wb = Workbook()
        wb1 = wb.create_sheet('index', 0)
        wb1.title = '管理数据'
        wb1.append(biaoti)
        for row in treedata_jianhuaguanli:
            wb1.append(row)
        wb.save(filename)
    except Exception as exc:
        # GUI boundary: report the failure instead of letting it vanish, and
        # include the cause (e.g. the file is open in another program).
        print(exc)
        messagebox.showerror("提示", '保存文件错误,请重试~!!' + '\n' + str(exc))
        return

    # Name the file that was actually written, not a hard-coded one.
    messagebox.showinfo("提示", filename + "保存完毕~!!!")
# ---- main window ---------------------------------------------------------
root = Tk()
root.title("排污许可证数据信息")
root.geometry("900x750+500+50")  # WIDTHxHEIGHT+X+Y: size, then screen position

# ---- status row: progress counter, timing text, error text ---------------
lbxianshixinxi = LabelFrame(root, width=800, text='', padx=80, pady=10)
lbxianshixinxi.grid(row=0, column=0)

l = Label(lbxianshixinxi, text='0/0', width=20)  # scraped / total
l.grid(row=0, column=0, padx=10, pady=10, sticky=W)

l2 = Label(lbxianshixinxi, text='程序运行时间:', justify=RIGHT)  # delay countdown / run time
l2.grid(row=0, column=1, padx=10, pady=10, sticky=W)

lerror = Label(lbxianshixinxi, text='', width=25, fg='red', justify=RIGHT)  # last failed page
lerror.grid(row=0, column=2, padx=10, pady=10, sticky=W)

# ---- container frames ----------------------------------------------------
lbtree = LabelFrame(root, width=500, height=10, text='数据显示区域', padx=8, pady=10)
lbtree.grid(row=1, column=0)

lb4 = LabelFrame(root, width=800, height=500, text='重点管理排污许可证数据', padx=8, pady=8, foreground='red')
lb4.grid(row=8, column=0)

# ---- action buttons ------------------------------------------------------
button = Button(lb4, text=" 开始爬取(重点管理) ", command=kaishipaqu_begin)
button.grid(row=5, column=2, padx=20, pady=0, sticky=W)

button2 = Button(lb4, text=" 保存列表信息数据 ", command=jiayiguanli_save)
button2.grid(row=5, column=4, sticky=W)

button1 = Button(lb4, text=" 退 出 ", command=root.quit)
button1.grid(row=5, column=5, padx=20, pady=0, sticky=N)

# ---- result table with scrollbars ----------------------------------------
# show="headings" displays only the declared columns, not the tree column.
tree2 = ttk.Treeview(lbtree, height=20, show="headings")

scroll2_ty = Scrollbar(root, orient=VERTICAL, command=tree2.yview)
scroll2_ty.grid(row=1, column=2, sticky=N + S)
tree2['yscrollcommand'] = scroll2_ty.set

scroll2_tx = Scrollbar(root, orient=HORIZONTAL, command=tree2.xview)
scroll2_tx.grid(row=3, column=0, sticky=E + W)
tree2['xscrollcommand'] = scroll2_tx.set

tree2.grid(row=1, columnspan=1)

tree2["columns"] = ('序号', '省/直辖市', '地市', '许可证编号', '企业名称', '行业类别', '有效期限', '登记时间', '详细链接')

# Column widths: the running-number column is narrow, all others share one width.
for _col in tree2["columns"]:
    tree2.column(_col, width=50 if _col == "序号" else 100)

# Header captions are the column identifiers themselves.
for _col in tree2["columns"]:
    tree2.heading(_col, text=_col)

tree2.bind('<ButtonRelease>', tree_click)  # report the clicked row

root.mainloop()  # enter the Tk event loop
更多推荐
已为社区贡献2条内容
所有评论(0)