Python3异步爬虫（asyncio的速度和串行差不多。。。这）

import asyncio, requests, pymongoimport re, timeclass asycioSpider(object):"""docstring for asycioSpider"""headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) ...

望向缸外的大眼睛

3457人浏览 · 2019-04-07 20:57:22

望向缸外的大眼睛 · 2019-04-07 20:57:22 发布

import asyncio, requests, pymongo
import re, time
class asycioSpider(object):
    """docstring for asycioSpider"""
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36'}
    def __init__(self, outline, regexBody):
        super(asycioSpider, self).__init__()
        self.regex = outline[0]
        self.regexName = outline[1]
        self.regexBody = regexBody[0]
        self.regexBodyName = regexBody[1]
        # 请求头文件
    # 数据库初始化
    def pymongoSpider(self, ip, dbName): 
        client = pymongo.MongoClient(ip)
        db = client[dbName]
        return db
    # 获取一本的url
    def novelLinks(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = res.apparent_encoding
        print("小说大纲页下载成功!")
        links = re.findall(self.regex, res.text, re.S|re.I|re.M)
        links = iter([url + link.split('/')[-1] for link in links])
        name  = re.findall(self.regexName, res.text, re.S|re.I|re.M)
        print("小说大纲页数据处理成功！")
        return (name[0], links)
    @asyncio.coroutine
    def requestsData(self, url):
        return requests.get(url, headers=self.headers)
    @asyncio.coroutine
    def findallData(self, regex, res):
        return re.findall(regex, res.text, re.S|re.I|re.M)
    # 获取一章小说
    @asyncio.coroutine
    def novelBody(self, url):
        # start = time.time()
        res = yield from self.requestsData(url)
        # print("requests: {}".format(time.time() - start))
        # start = time.time()
        res.encoding = res.apparent_encoding
        # print("encoding: {}".format(time.time() - start))
        # start = time.time()
        title = yield from self.findallData(self.regexName, res)
        # print("findall chapterName: {}".format(time.time() - start))
        # start = time.time()
        body  = yield from self.findallData(self.regexBody, res)
        # print("findall chapterBody: {}".format(time.time() - start))
        print(title, "下载成功！")
        return {'url': url, 'title': title[0], 'body': title[0] + '\n' + body[0]}
    @asyncio.coroutine
    def pymongoSave(self, url, name, db):
        # start = time.time()
        data = yield from self.novelBody(url)
        yield from self.saveDB(name, db, data)
        # print("saveDB: {}".format(time.time() - start))
    @asyncio.coroutine
    def saveDB(self, name, db, data):
        db[name].insert_one(data)
    # 开始运行
    def start(self, url, ip, dbName):
        print("爬虫启动...")
        novelName, links = self.novelLinks(url)
        print("开始创建数据库...")
        db = self.pymongoSpider(ip, dbName)
        print("数据库创建完毕！")
        print("异步爬虫启动...")
        loop = asyncio.get_event_loop()
        print("异步爬虫下载中...")
        tasks = [self.pymongoSave(link, novelName, db) for link in links]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
        print("异步爬虫关闭!")
        print(dbName + ' 下载完成！')
if __name__ == '__main__':
    import time
    start_time = time.time()
    outline = (r"<dd><a href='(.*?)' >.*?</a></dd>", r'<h1>(.*?)</h1>')
    regexBody = (r'<div id="content">(.*?)<p>.*?</p></div>', r'<h1>(.*?)</h1>')
    url = r'http://www.xbiquge.la/10/10489/'
    ip  = 'localhost:27017'
    spider = asycioSpider(outline, regexBody)
    spider.start(url, ip, 'biquge')
    print("The Program run end {}".format(time.time() - start_time))

Linux

更多推荐

网卡速率和双工模式的配置

http://linux.chinaitlab.com/system/792187.html1、mii-tool 配置网络设备协商方式的工具； 1.1 mii-tool 介绍； mii-tool - view, manipulate media-independent interface status （mii-tool 是查看，管理介质的网络接口的状态）

Linux

Linux虚拟文件系统之文件系统卸载（sys_umount())

Linux中卸载文件系统由umount系统调用实现，入口函数为sys_umount()。较于文件系统的安装较为简单，下面是具体的实现。1. /*sys_umont系统调用*/2. SYSCALL_DEFINE2(umount, char __user *, name, int, flags)3. {4.struct path path;

Linux

Linux系统下超级终端Minicom的使用方法（例如：连接交换机，路由器等）转http://baike.baidu.com/view/2911642.htm?fr=ala0_1

Linux系统下超级终端Minicom的使用方法 　　Linux下的Minicom的功能与下的超级终端功能相似，适于在通过超级终端对设备的管理以及对嵌入操作系统的升级，现写出Minicom的使用手册： 　　1．启动minicom 　　以root权限登录系统 　　使用命令 　　minicom –s 则minicom启动，屏