Python爬取武汉地铁信息
Python爬取武汉地铁信息:话不多说,直接上代码。from lxml import etree  import urllib3.request  import pandas as pd  urllib3.disable_warnings()  # 生成待解析的对象  def getTree(url): pool_manager = urllib3.PoolManager()  response = pool_manager.…
·
Python爬取武汉地铁信息
话不多说,直接上代码
import logging
from urllib.parse import urljoin

import pandas as pd
import urllib3.request
from lxml import etree
# Turn off urllib3's warning output for the whole process (library helper
# that installs an "ignore" filter for urllib3's warning category).
urllib3.disable_warnings()
def getTree(url):
    """Download ``url`` and parse the response body into an lxml HTML tree.

    Args:
        url: Address of the page to fetch with an HTTP GET request.

    Returns:
        Whatever ``etree.HTML`` produces for the decoded response body.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
        Exception: Any error urllib3 raises while performing the request is
            propagated unchanged.
    """
    # Use the pool as a context manager so the connections it opens are
    # released when we are done; a bare PoolManager() per call is never
    # closed and leaks its pooled sockets.
    with urllib3.PoolManager() as pool_manager:
        response = pool_manager.request('GET', url)
        # bytes.decode() with no argument decodes as UTF-8.
        html_text = response.data.decode()
    return etree.HTML(html_text)
def page(url):
    """Scrape the header texts and station names from one line's detail page.

    Args:
        url: Address of the metro line's detail page.

    Returns:
        A ``(metro_name, metro_stations)`` tuple of two lists:
        ``metro_name`` holds the text nodes of the direct children of the
        ``<div class="ib-hd lm-hd">`` header, ``metro_stations`` holds the
        text of every ``<a class="cl-station">`` link inside
        ``<ul class="clear">``. If the page cannot be fetched or parsed,
        the failure is logged and ``([], [])`` is returned so callers can
        always unpack the result.
    """
    try:
        tree = getTree(url)
        metro_name = tree.xpath('//div[@class="ib-hd lm-hd"]/*/text()')
        metro_stations = tree.xpath(
            '//ul[@class="clear"]/li/a[@class="cl-station"]/text()'
        )
    except Exception:
        # Best-effort scraping: one broken page should not hide its cause
        # (previously the error was swallowed and None was returned, which
        # surfaced later as an unrelated unpacking TypeError in the caller).
        logging.getLogger(__name__).exception('failed to scrape %s', url)
        return [], []
    return metro_name, metro_stations
def getEntrance(homeUrl, output_path='data.xls'):
    """Scrape the line listing page plus every line page and export a table.

    The listing page supplies line names, links to the per-line pages and
    the run-time / update-time texts; each per-line page (via ``page``)
    supplies a header text and the station list. Everything is combined
    into one ``pandas.DataFrame`` and written with ``DataFrame.to_excel``.

    Args:
        homeUrl: Address of the listing page.
        output_path: Destination of the exported spreadsheet. Defaults to
            ``'data.xls'``, the path that was previously hard-coded.

    Returns:
        None. The result is the written file and a ``finished!`` message
        on stdout.

    Raises:
        IndexError: If the listing page has no ``<div class="ib-box">``.
        ValueError: If the scraped columns end up with different lengths
            (raised by ``pandas.DataFrame``).
    """
    tree = getTree(homeUrl)
    div = tree.xpath('//div[@class="ib-box"]')[0]
    # NOTE(review): the queries below start with `//`, which in XPath
    # searches from the document root even when called on `div`; `.//`
    # would restrict them to this box. Left unchanged - confirm against
    # the live page before narrowing.
    line_name = div.xpath('//ul/li/a/text()')
    page_links = div.xpath('//ul/li/a/@href')
    line_info = div.xpath('//ul/li/div//text()')
    # Assumes each line contributes four text nodes, with the run time at
    # offset 1 and the update time at offset 3 - TODO confirm.
    run_time = line_info[1::4]
    update_time = line_info[3::4]

    base_url = "https://dt.8684.cn/"
    metro_counts = []
    metro_stations = []
    for page_link in page_links:
        # urljoin copes with both "/path" and "path" hrefs without
        # producing a doubled slash, and leaves absolute links intact.
        result = page(urljoin(base_url, page_link))
        # `page` may yield None or empty lists when a line page fails;
        # record placeholders so every column keeps one entry per link.
        metro_count, metro_station = result if result else ([], [])
        metro_counts.append(metro_count[1] if len(metro_count) > 1 else None)
        metro_stations.append(metro_station)

    # Assemble the per-line columns into a single table.
    data = {
        'line_name': line_name,
        'run_time': run_time,
        'update_time': update_time,
        'metro_count': metro_counts,
        'metro_stations': metro_stations,
    }
    df = pd.DataFrame(data)
    # NOTE(review): writing the legacy .xls format depends on an engine
    # that recent pandas releases may no longer ship; pass an .xlsx
    # `output_path` if the default fails - confirm for your pandas version.
    df.to_excel(output_path)
    print('finished!')
# Listing page the scraper starts from (presumably the Wuhan metro line
# overview on dt.8684.cn - verify the URL is still live).
homeUrl = 'https://dt.8684.cn/wh_list_time'

# Only scrape when executed as a script, not when imported as a module.
if __name__ == '__main__':
    getEntrance(homeUrl)
效果
为武汉地区的开发者提供学习、交流和合作的平台。社区聚集了众多技术爱好者和专业人士,涵盖了多个领域,包括人工智能、大数据、云计算、区块链等。社区定期举办技术分享、培训和活动,为开发者提供更多的学习和交流机会。
更多推荐
已为社区贡献1条内容
所有评论(0)