Scraping the Gufeng Manhua comic site (gufengmh8.com) with Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from urllib import request
import time
import os
from concurrent.futures import ThreadPoolExecutor
# Initialize environment
rootPath = "D:\\Comic"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
setname = "http://m.gufengmh8.com"
# startChapter = 0
# Catalogue URL of the comic to scrape
url = "http://m.gufengmh8.com/manhua/shenyongjianglin/"
print("Enter the URL of the comic's chapter list page:")
#url = input()
print("Enter the starting chapter (enter 1 to download the whole book):")
#startChapter = int(input()) - 1
startChapter = 366
# Rewrite a desktop URL into its mobile counterpart
if url[8] == 'w':
    url = "http://m" + url[10:]
print(url)
# Request the catalogue URL and extract the comic title
req = requests.get(url, headers=header)
result = req.content
result = result.decode("utf-8")
soup = BeautifulSoup(result, 'html5lib')
titleName = soup.title.string
# Locate the chapter list element
chapter = soup.find_all(id="chapter-list-1")[0]
chapterUrlList = chapter.find_all("a")
chapterNameList = chapter.find_all("span")
# Collect the chapter URLs
chapterUrl = []
for line in chapterUrlList:
    chapterUrl.append(line.get("href"))
# Collect the chapter names, replacing ASCII colons (invalid in Windows paths) with fullwidth ones
chapterName = []
for line in chapterNameList:
    s = line.string.replace(":", "：")
    chapterName.append(s)
print(chapterUrl)
print(chapterName)
# Offset counter into the chapter list
step = 0
def download_chapter(url, CN):
    print("Downloading chapter: " + CN)
    # Create the directory for this chapter if it does not exist
    path = rootPath + "\\" + titleName + "\\" + CN + "\\"
    if not os.path.exists(path):
        os.makedirs(path)
    # Request the chapter page, read its page count, then pause for 0.3 s
    url = setname + url
    req = requests.get(url, headers=header)
    req = req.content
    req = req.decode('utf-8', 'ignore')
    soup = BeautifulSoup(req, "html5lib")
    chapterPageNumber = soup.find_all(id="k_total")
    if len(chapterPageNumber) == 0:
        chapterPageNumber = soup.find_all(id="total-page")[0].string
    else:
        chapterPageNumber = chapterPageNumber[0].string
    time.sleep(0.3)
    # Strip the ".html" suffix so page numbers can be appended
    urlPage = url[:-5]
    # Page counter within the current chapter
    nowChapterPage = 1
    while True:
        # Build the URL of the current page
        url = urlPage + "-%d" % nowChapterPage + ".html"
        # Request the current page
        req = requests.get(url, headers=header)
        result = req.content
        result = result.decode("utf-8")
        soup = BeautifulSoup(result, "html5lib")
        # Extract the image URL on this page (mobile pages use <mip-img>, others use <img>)
        imgUrl = soup.find_all("mip-img")
        if len(imgUrl) == 0:
            imgUrl = soup.find_all("img")[0]
        else:
            imgUrl = imgUrl[0]
        imgUrl = imgUrl.get("src")
        # print(path)
        # Save the image
        try:
            request.urlretrieve(imgUrl, path + "%d.jpg" % nowChapterPage)
        except Exception:
            print()
            print("Comic " + CN + ": failed to download page %d" % nowChapterPage)
        # Print download progress
        #print("\rChapter progress: %.1f%%" % (nowChapterPage * 100 / int(chapterPageNumber)), end="", flush=True)
        # Stop when the last page has been reached
        if nowChapterPage == int(chapterPageNumber):
            print()
            print(CN + " download complete")
            break
        # Move to the next page and pause for 0.2 s
        nowChapterPage = nowChapterPage + 1
        time.sleep(0.2)
# Create a thread pool with 10 workers
pool = ThreadPoolExecutor(10)
# Submit one download task per chapter
for url in chapterUrl:
    # Stop after the last chapter
    if step + startChapter == len(chapterUrl):
        break
    # Look up the current chapter's name and URL
    CN = chapterName[step + startChapter]
    url = chapterUrl[step + startChapter]
    # Hand the chapter off to a worker thread, which runs download_chapter
    pool.submit(download_chapter, url, CN)
    # Advance the chapter offset
    time.sleep(0.2)
    step = step + 1
# print(titleName + " download complete")
# print("The comic is saved under: " + rootPath)
# print("Press Enter to exit....")
# exit = input()
''' code update log
2018/11/30 1. Build every soup object from the mobile-site URL only
           2. Handle both page layouts when reading the chapter page count and the image URL tag
           3. Add error handling for pages whose image fails to download
2019/1/14  1. Add a dynamic download-progress display
           2. Fix the bug where an English colon in a chapter name produced an invalid storage path
2019/2/22  1. Rewrite the code around a thread pool with 10 workers
'''
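One thing worth noting: pool.submit returns immediately, so the commented-out completion messages at the end of the main script would run before the worker threads have finished. A minimal sketch of how the tail of the script could wait for every chapter, reusing the pool, titleName, and rootPath defined above:

# Block until every submitted download_chapter task has finished
pool.shutdown(wait=True)
print(titleName + " download complete")
print("The comic is saved under: " + rootPath)

Alternatively, the Future objects returned by pool.submit can be collected and passed to concurrent.futures.wait, which achieves the same effect without shutting the pool down.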