Python爬取武汉地铁信息

话不多说,直接上代码

from lxml import etree
import urllib3.request
import pandas as pd
urllib3.disable_warnings()


# Build the object that the XPath queries are run against.
def getTree(url):
    """Fetch ``url`` with a GET request and return its body parsed by ``etree.HTML``.

    The response bytes are decoded with the default ``bytes.decode()`` codec
    (UTF-8) before being handed to the HTML parser.
    """
    http = urllib3.PoolManager()
    raw_body = http.request('GET', url).data
    html_text = raw_body.decode()
    return etree.HTML(html_text)


def page(url):
    """Return the heading texts and station names scraped from one line page.

    Args:
        url: URL of a single line's detail page.

    Returns:
        A ``(metro_name, metro_stations)`` tuple of lists of strings taken
        from the page's heading block and its station links. Both lists are
        empty when the page cannot be fetched or parsed, so callers can
        always unpack the result.
    """
    import logging

    try:
        tree = getTree(url)
        metro_name = tree.xpath('//div[@class="ib-hd lm-hd"]/*/text()')
        metro_stations = tree.xpath('//ul[@class="clear"]/li/a[@class="cl-station"]/text()')
    except Exception:
        # Scraping is best-effort: report the failure instead of silently
        # returning None, which made the caller fail on tuple unpacking.
        logging.getLogger(__name__).warning('failed to scrape %s', url, exc_info=True)
        return [], []
    return metro_name, metro_stations


def getEntrance(homeUrl):
    """Scrape the line index page at ``homeUrl`` and export one row per line.

    Args:
        homeUrl: URL of the index page that lists the lines and links to
            each line's detail page.

    Side effects:
        Requests every linked detail page through ``page``, writes the
        collected table to ``data.xls`` in the current working directory and
        prints ``finished!``.
    """
    tree = getTree(homeUrl)
    # NOTE(review): the queries below start with '//', which XPath evaluates
    # against the whole document rather than relative to `div` — confirm that
    # this is intended before switching to './/'.
    div = tree.xpath('//div[@class="ib-box"]')[0]
    line_name = div.xpath('//ul/li/a/text()')
    page_links = div.xpath('//ul/li/a/@href')
    line_info = div.xpath('//ul/li/div//text()')
    # The stride of 4 assumes each line contributes four text nodes, with the
    # running time second and the update time fourth — verify against the page.
    run_time = line_info[1::4]
    update_time = line_info[3::4]

    metro_counts = []
    metro_stations = []
    for page_link in page_links:
        result = page("https://dt.8684.cn/" + page_link)
        # A failed page() call may yield None or too few heading texts; record
        # a missing value for that line instead of aborting the whole export.
        heading_texts, stations = result if result is not None else ([], [])
        # Presumably the second heading text is the station count (the
        # original stored it as 'metro_count') — TODO confirm.
        metro_counts.append(heading_texts[1] if len(heading_texts) > 1 else None)
        metro_stations.append(stations)

    # Assemble the columns into a single table with pandas.
    data = {
        'line_name': line_name,
        'run_time': run_time,
        'update_time': update_time,
        'metro_count': metro_counts,
        'metro_stations': metro_stations,
    }
    df = pd.DataFrame(data)
    # NOTE(review): writing the legacy '.xls' format needs an Excel writer
    # engine that newer pandas releases may no longer provide — confirm, and
    # consider '.xlsx' if the export fails.
    df.to_excel('data.xls')
    print('finished!')



# Index page that getEntrance() is pointed at when this file is run as a script.
homeUrl = 'https://dt.8684.cn/wh_list_time'

# Start scraping only when executed directly, not when imported as a module.
if __name__ == '__main__':
    getEntrance(homeUrl)

效果

在这里插入图片描述

Logo

为武汉地区的开发者提供学习、交流和合作的平台。社区聚集了众多技术爱好者和专业人士,涵盖了多个领域,包括人工智能、大数据、云计算、区块链等。社区定期举办技术分享、培训和活动,为开发者提供更多的学习和交流机会。

更多推荐