python 多进程爬虫抓取小说 (Python multi-process novel scraper)

最近在学习爬虫,这是一个多进程的例子,基本都是网络搜索而来,没有署名,抱歉!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import multiprocessing
from bs4 import BeautifulSoup
from retrying import retry
import requests
# HTTP request headers shared by every request this script makes.
# User-Agent mimics a desktop Chrome browser; Host is pinned to the
# target site www.biqukan.com, so these headers only suit that host.
myhead = {
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
'Accept': '*/*',
'Accept-Encoding': 'gzip,deflate,sdch, br',
'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4',
'Cache-Control': 'max-age=0',
'Connection': 'close',
'Proxy-Connection': 'no-cache',
'Host': 'www.biqukan.com'
}
def parse_url(url):
    """Fetch *url* with retries and parse the response into a soup tree.

    Returns a BeautifulSoup object (html5lib parser) on success, or
    ``None`` when the request ultimately fails after all retries.
    """
    # wait_random_min / wait_random_max: random back-off between retries, in
    # milliseconds; stop_max_attempt_number: give up after 100 failed tries.
    @retry(wait_random_min=200,
           wait_random_max=3000,
           stop_max_attempt_number=100)
    def _parse_url(url):
        response = requests.get(url, headers=myhead, timeout=6)
        # Raise requests.HTTPError on 4xx/5xx so the retry decorator fires.
        # (The original used `assert status_code == 200`, which is silently
        # stripped when Python runs with -O, disabling the check entirely.)
        response.raise_for_status()
        return response

    try:
        # The except clauses below catch whatever survives the retry loop.
        response = _parse_url(url)
        htmlTree = BeautifulSoup(response.text, 'html5lib')
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError:', e, url, flush=True)
        htmlTree = None
    except requests.exceptions.ChunkedEncodingError as e:
        print('ChunkedEncodingError:', e, url, flush=True)
        htmlTree = None
    except Exception as e:
        print('Unfortunitely Unknow Error:', e, url, flush=True)
        htmlTree = None
    return htmlTree
def writer(_filename, 内容):
    """Append scraped chapters to the file *_filename* in reading order.

    内容 is a list of ``[index, chapter_title, chapter_text]`` triples.
    Entries are written sorted by index; the caller's list is NOT mutated
    (the original sorted it in place as a hidden side effect).
    """
    print('[' + _filename + ']开始保存......', end='', flush=True)
    with open(_filename, 'a', encoding='utf-8') as f:
        # sorted() orders triples by their leading chapter index.
        for chapter in sorted(内容):
            f.write(chapter[1] + '\n' + chapter[2] + '\n')
    print('[' + _filename + ']保存完成。', flush=True)
def get_stime(compact=True):
    """Return the current local time as a formatted string.

    When *compact* is true (the default), return a 17-digit timestamp such
    as ``'20240101123045123'``; otherwise return the human-readable form
    ``'2024-01-01 12:30:45.123'``.

    Note: the parameter was renamed from ``bool``, which shadowed the
    builtin; every caller in this file passes it positionally or not at all.
    """
    now = time.time()
    date_clock = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(now))
    millis = (now - int(now)) * 1000  # fractional second, as milliseconds
    time_stamp = "%s.%03d" % (date_clock, millis)
    if compact:
        # Strip the '-', ' ', ':' and '.' separators, leaving digits only.
        date_part, clock_part = time_stamp.split()
        return (date_part.replace('-', '')
                + clock_part.replace(':', '').replace('.', ''))
    return time_stamp
def get_download_url(target):
    """Scrape the book index page *target*.

    Returns ``(book_name, chapter_urls)`` where chapter_urls lists the
    absolute URL of every main-text chapter, in page order.
    """
    chapter_urls = []
    soup = parse_url(target)
    book_name = soup.find('h2').get_text()
    # Re-parse just the <div class="listmain"> chapter listing.
    listing_html = str(soup.find_all('div', class_='listmain')[0])
    listing = BeautifulSoup(listing_html, features="html5lib")
    # Links before the "正文卷" heading are latest-chapter teasers and are
    # skipped; collection starts once that heading is seen.
    collecting = False
    for node in listing.dl.children:
        if node.string.strip() == '《' + book_name + '》正文卷':
            collecting = True
        if collecting and node.name == 'dd':
            chapter_urls.append('http://www.biqukan.com/' + node.find('a').get('href'))
    return book_name, chapter_urls
def get_contents(lock, index, url):
    """Download one chapter page; return ``[index, chapter_name, text]``.

    *lock* serializes the progress print across pool worker processes.
    """
    page = parse_url(url)
    chapter_name = page.h1.get_text()  # chapter title from the <h1>
    body = page.select('.showtxt')[0]
    # Each stripped text fragment becomes one newline-terminated line.
    chapter_text = ''.join(fragment + '\n' for fragment in body.stripped_strings)
    with lock:
        print('{} done with {} at {}'.format(multiprocessing.current_process().name, index, get_stime()), flush=True)
    return [index, chapter_name, chapter_text]
def main_Pool(target):
    # Orchestrates the whole download: fetch the chapter list for the book
    # index page *target*, fan chapter downloads out over a process pool,
    # then write the assembled book to '<bookname>.txt'.
    _stime = time.time()
    # The parent process creates the lock via a Manager so it can be
    # pickled and shared with the pool's worker processes.
    lock = multiprocessing.Manager().Lock()
    print('开始下载:《{}》\t{}\t获取下载链接......'.format(target, get_stime()), flush=True)
    bookname, urls = get_download_url(target)
    print('multiprocessing.pool,开始下载:《' + bookname + '》', flush=True)
    mypool = multiprocessing.Pool(20)  # number of worker processes
    # Submit one async task per chapter URL; results collected after join.
    future_tasks = []
    for i in range(len(urls)):
        item = mypool.apply_async(get_contents, args=(lock, i, urls[i]))
        # pool.apply_async(func=task,callback=_callba)
        future_tasks.append(item)
    mypool.close()  # stop accepting new tasks
    mypool.join()  # wait for all pool workers to finish, then reclaim them
    texts = []  # accumulates every downloaded chapter triple
    for i, item in enumerate(future_tasks):
        _text = item.get()  # worker's return value, safe to fetch after join
        texts.append(_text)
    print('\n multiprocessing.pool,书籍《' + bookname + '》完成下载', flush=True)
    writer(bookname + '.txt', texts)
    print('{} 结束,\t用时:{} 秒。'.format(get_stime(), round(time.time() - _stime, 2)), flush=True)
if __name__ == '__main__':
    # Download one specific book from biqukan.com.
    main_Pool('https://www.biqukan.com/2_2704/')
    # Original author's note: ~120 seconds and 6239 KB on their machine.
点击阅读全文
更多推荐
目录
所有评论(0)