Scraping the Gufeng Manhua comic site (gufengmh8.com) with Python
#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from urllib import request
import time
import os
from concurrent.futures import ThreadPoolExecutor
# Initialize environment
rootPath = "D:\\Comic"
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
setname = "http://m.gufengmh8.com"
# startChapter = 0
# Catalogue URL of the comic to scrape
url = "http://m.gufengmh8.com/manhua/shenyongjianglin/"
print("Enter the URL of the comic's chapter list page:")
#url = input()
print("Enter the starting chapter (enter 1 to download the whole book):")
#startChapter = int(input()) - 1
startChapter = 366
# Rewrite a desktop URL into its mobile counterpart
if url[8] == 'w':
    url = "http://m" + url[10:]
print(url)
# Request the catalogue URL and extract the comic title
req = requests.get(url, headers=header)
result = req.content
result = result.decode("utf-8")
soup = BeautifulSoup(result, 'html5lib')
titleName = soup.title.string
# Locate the chapter list element
chapter = soup.find_all(id="chapter-list-1")[0]
chapterUrlList = chapter.find_all("a")
chapterNameList = chapter.find_all("span")
# Collect the chapter URLs
chapterUrl = []
for line in chapterUrlList:
    chapterUrl.append(line.get("href"))
# Collect the chapter names, replacing ASCII colons (invalid in Windows paths) with fullwidth ones
chapterName = []
for line in chapterNameList:
    s = line.string.replace(":", "：")
    chapterName.append(s)
print(chapterUrl)
print(chapterName)
# Offset counter into the chapter list
step = 0
def download_chapter(url, CN):
    print("Downloading chapter: " + CN)
    # Create the directory for this chapter if it does not exist
    path = rootPath + "\\" + titleName + "\\" + CN + "\\"
    if not os.path.exists(path):
        os.makedirs(path)
    # Request the chapter page, read its page count, then pause for 0.3 s
    url = setname + url
    req = requests.get(url, headers=header)
    req = req.content
    req = req.decode('utf-8', 'ignore')
    soup = BeautifulSoup(req, "html5lib")
    chapterPageNumber = soup.find_all(id="k_total")
    if len(chapterPageNumber) == 0:
        chapterPageNumber = soup.find_all(id="total-page")[0].string
    else:
        chapterPageNumber = chapterPageNumber[0].string
    time.sleep(0.3)
    # Strip the ".html" suffix so page numbers can be appended
    urlPage = url[:-5]
    # Page counter within the current chapter
    nowChapterPage = 1
    while True:
        # Build the URL of the current page
        url = urlPage + "-%d" % nowChapterPage + ".html"
        # Request the current page
        req = requests.get(url, headers=header)
        result = req.content
        result = result.decode("utf-8")
        soup = BeautifulSoup(result, "html5lib")
        # Extract the image URL on this page (mobile pages use <mip-img>, others use <img>)
        imgUrl = soup.find_all("mip-img")
        if len(imgUrl) == 0:
            imgUrl = soup.find_all("img")[0]
        else:
            imgUrl = imgUrl[0]
        imgUrl = imgUrl.get("src")
        # print(path)
        # Save the image
        try:
            request.urlretrieve(imgUrl, path + "%d.jpg" % nowChapterPage)
        except Exception:
            print()
            print("Comic " + CN + ": failed to download page %d" % nowChapterPage)
        # Print download progress
        #print("\rChapter progress: %.1f%%" % (nowChapterPage * 100 / int(chapterPageNumber)), end="", flush=True)
        # Stop when the last page has been reached
        if nowChapterPage == int(chapterPageNumber):
            print()
            print(CN + " download complete")
            break
        # Move to the next page and pause for 0.2 s
        nowChapterPage = nowChapterPage + 1
        time.sleep(0.2)
# Create a thread pool with 10 workers
pool = ThreadPoolExecutor(10)
# Submit one download task per chapter
for url in chapterUrl:
    # Stop after the last chapter
    if step + startChapter == len(chapterUrl):
        break
    # Look up the current chapter's name and URL
    CN = chapterName[step + startChapter]
    url = chapterUrl[step + startChapter]
    # Hand the chapter off to a worker thread, which runs download_chapter
    pool.submit(download_chapter, url, CN)
    # Advance the chapter offset
    time.sleep(0.2)
    step = step + 1
# print(titleName + " download complete")
# print("The comic is saved under: " + rootPath)
# print("Press Enter to exit....")
# exit = input()
''' code update log
2018/11/30 1. Build every soup object from the mobile-site URL only
           2. Handle both page layouts when reading the chapter page count and the image URL tag
           3. Add error handling for pages whose image fails to download
2019/1/14  1. Add a dynamic download-progress display
           2. Fix the bug where an English colon in a chapter name produced an invalid storage path
2019/2/22  1. Rewrite the code around a thread pool with 10 workers
'''
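One thing worth noting: pool.submit returns immediately, so the commented-out completion messages at the end of the main script would run before the worker threads have finished. A minimal sketch of how the tail of the script could wait for every chapter, reusing the pool, titleName, and rootPath defined above:

# Block until every submitted download_chapter task has finished
pool.shutdown(wait=True)
print(titleName + " download complete")
print("The comic is saved under: " + rootPath)

Alternatively, the Future objects returned by pool.submit can be collected and passed to concurrent.futures.wait, which achieves the same effect without shutting the pool down.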