python 多进程爬虫抓取小说 (Python multi-process novel scraper)

最近在学习爬虫,这是一个多进程的例子,基本都是网络搜索而来,没有署名,抱歉!
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import time
import multiprocessing
from bs4 import BeautifulSoup
from retrying import retry
import requests
# HTTP request headers shared by every request this script makes.
# User-Agent mimics a desktop Chrome browser; Host is pinned to the
# target site www.biqukan.com, so these headers only suit that host.
myhead = {
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
'Accept': '*/*',
'Accept-Encoding': 'gzip,deflate,sdch, br',
'Accept-Language': 'zh-TW,zh;q=0.8,en-US;q=0.6,en;q=0.4',
'Cache-Control': 'max-age=0',
'Connection': 'close',
'Proxy-Connection': 'no-cache',
'Host': 'www.biqukan.com'
}
def parse_url(url):
    """Fetch *url* with retries and parse the response into a soup tree.

    Returns a BeautifulSoup object (html5lib parser) on success, or
    ``None`` when the request ultimately fails after all retries.
    """
    # wait_random_min / wait_random_max: random back-off between retries, in
    # milliseconds; stop_max_attempt_number: give up after 100 failed tries.
    @retry(wait_random_min=200,
           wait_random_max=3000,
           stop_max_attempt_number=100)
    def _parse_url(url):
        response = requests.get(url, headers=myhead, timeout=6)
        # Raise requests.HTTPError on 4xx/5xx so the retry decorator fires.
        # (The original used `assert status_code == 200`, which is silently
        # stripped when Python runs with -O, disabling the check entirely.)
        response.raise_for_status()
        return response

    try:
        # The except clauses below catch whatever survives the retry loop.
        response = _parse_url(url)
        htmlTree = BeautifulSoup(response.text, 'html5lib')
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError:', e, url, flush=True)
        htmlTree = None
    except requests.exceptions.ChunkedEncodingError as e:
        print('ChunkedEncodingError:', e, url, flush=True)
        htmlTree = None
    except Exception as e:
        print('Unfortunitely Unknow Error:', e, url, flush=True)
        htmlTree = None
    return htmlTree
def writer(_filename, 内容):
    """Append scraped chapters to the file *_filename* in reading order.

    内容 is a list of ``[index, chapter_title, chapter_text]`` triples.
    Entries are written sorted by index; the caller's list is NOT mutated
    (the original sorted it in place as a hidden side effect).
    """
    print('[' + _filename + ']开始保存......', end='', flush=True)
    with open(_filename, 'a', encoding='utf-8') as f:
        # sorted() orders triples by their leading chapter index.
        for chapter in sorted(内容):
            f.write(chapter[1] + '\n' + chapter[2] + '\n')
    print('[' + _filename + ']保存完成。', flush=True)
def get_stime(compact=True):
    """Return the current local time as a formatted string.

    When *compact* is true (the default), return a 17-digit timestamp such
    as ``'20240101123045123'``; otherwise return the human-readable form
    ``'2024-01-01 12:30:45.123'``.

    Note: the parameter was renamed from ``bool``, which shadowed the
    builtin; every caller in this file passes it positionally or not at all.
    """
    now = time.time()
    date_clock = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(now))
    millis = (now - int(now)) * 1000  # fractional second, as milliseconds
    time_stamp = "%s.%03d" % (date_clock, millis)
    if compact:
        # Strip the '-', ' ', ':' and '.' separators, leaving digits only.
        date_part, clock_part = time_stamp.split()
        return (date_part.replace('-', '')
                + clock_part.replace(':', '').replace('.', ''))
    return time_stamp
def get_download_url(target):
    """Scrape the book index page *target*.

    Returns ``(book_name, chapter_urls)`` where chapter_urls lists the
    absolute URL of every main-text chapter, in page order.
    """
    chapter_urls = []
    soup = parse_url(target)
    book_name = soup.find('h2').get_text()
    # Re-parse just the <div class="listmain"> chapter listing.
    listing_html = str(soup.find_all('div', class_='listmain')[0])
    listing = BeautifulSoup(listing_html, features="html5lib")
    # Links before the "正文卷" heading are latest-chapter teasers and are
    # skipped; collection starts once that heading is seen.
    collecting = False
    for node in listing.dl.children:
        if node.string.strip() == '《' + book_name + '》正文卷':
            collecting = True
        if collecting and node.name == 'dd':
            chapter_urls.append('http://www.biqukan.com/' + node.find('a').get('href'))
    return book_name, chapter_urls
def get_contents(lock, index, url):
    """Download one chapter page; return ``[index, chapter_name, text]``.

    *lock* serializes the progress print across pool worker processes.
    """
    page = parse_url(url)
    chapter_name = page.h1.get_text()  # chapter title from the <h1>
    body = page.select('.showtxt')[0]
    # Each stripped text fragment becomes one newline-terminated line.
    chapter_text = ''.join(fragment + '\n' for fragment in body.stripped_strings)
    with lock:
        print('{} done with {} at {}'.format(multiprocessing.current_process().name, index, get_stime()), flush=True)
    return [index, chapter_name, chapter_text]
def main_Pool(target):
    # Orchestrates the whole download: fetch the chapter list for the book
    # index page *target*, fan chapter downloads out over a process pool,
    # then write the assembled book to '<bookname>.txt'.
    _stime = time.time()
    # The parent process creates the lock via a Manager so it can be
    # pickled and shared with the pool's worker processes.
    lock = multiprocessing.Manager().Lock()
    print('开始下载:《{}》\t{}\t获取下载链接......'.format(target, get_stime()), flush=True)
    bookname, urls = get_download_url(target)
    print('multiprocessing.pool,开始下载:《' + bookname + '》', flush=True)
    mypool = multiprocessing.Pool(20)  # number of worker processes
    # Submit one async task per chapter URL; results collected after join.
    future_tasks = []
    for i in range(len(urls)):
        item = mypool.apply_async(get_contents, args=(lock, i, urls[i]))
        # pool.apply_async(func=task,callback=_callba)
        future_tasks.append(item)
    mypool.close()  # stop accepting new tasks
    mypool.join()  # wait for all pool workers to finish, then reclaim them
    texts = []  # accumulates every downloaded chapter triple
    for i, item in enumerate(future_tasks):
        _text = item.get()  # worker's return value, safe to fetch after join
        texts.append(_text)
    print('\n multiprocessing.pool,书籍《' + bookname + '》完成下载', flush=True)
    writer(bookname + '.txt', texts)
    print('{} 结束,\t用时:{} 秒。'.format(get_stime(), round(time.time() - _stime, 2)), flush=True)
if __name__ == '__main__':
    # Download one specific book from biqukan.com.
    main_Pool('https://www.biqukan.com/2_2704/')
    # Original author's note: ~120 seconds and 6239 KB on their machine.
点击阅读全文
更多推荐
目录
所有评论(0)