import asyncio, requests, pymongo
import re, time
class asycioSpider(object):
    """Download every chapter of a novel and store the chapters in MongoDB.

    The outline (table-of-contents) page is fetched synchronously; the chapter
    pages are then fetched concurrently.  ``requests`` and ``pymongo`` are
    blocking libraries, so each blocking call is handed to the event loop's
    default thread-pool executor instead of being executed on the loop itself.
    """

    # Request headers sent with every HTTP request (mobile Chrome User-Agent).
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36'}
    # Seconds to wait for the server; without a timeout ``requests`` can block
    # forever on a stalled connection.
    timeout = 30

    def __init__(self, outline, regexBody):
        """Store the regular expressions used to parse the pages.

        Args:
            outline: ``(chapter-link pattern, novel-title pattern)`` applied to
                the outline page.
            regexBody: ``(chapter-body pattern, chapter-title pattern)``
                applied to every chapter page.
        """
        super(asycioSpider, self).__init__()
        self.regex = outline[0]
        self.regexName = outline[1]
        self.regexBody = regexBody[0]
        self.regexBodyName = regexBody[1]

    # Database initialisation
    def pymongoSpider(self, ip, dbName):
        """Connect to the MongoDB server at *ip* and return database *dbName*."""
        client = pymongo.MongoClient(ip)
        db = client[dbName]
        return db

    # Fetch the chapter URLs of one novel
    def novelLinks(self, url):
        """Fetch the outline page and extract the novel name and chapter URLs.

        Args:
            url: URL of the outline page.  Chapter URLs are built by appending
                the last path segment of every matched link to this URL.

        Returns:
            ``(name, links)`` where *name* is the first title match and
            *links* is an iterator over the chapter URLs.

        Raises:
            IndexError: if the title pattern matches nothing on the page.
        """
        res = requests.get(url, headers=self.headers, timeout=self.timeout)
        res.encoding = res.apparent_encoding
        print("小说大纲页下载成功!")
        flags = re.S | re.I | re.M
        links = re.findall(self.regex, res.text, flags)
        links = iter([url + link.split('/')[-1] for link in links])
        name = re.findall(self.regexName, res.text, flags)
        if not name:
            raise IndexError(
                "title pattern {!r} matched nothing on {}".format(self.regexName, url))
        print("小说大纲页数据处理成功!")
        return (name[0], links)

    async def requestsData(self, url):
        """Fetch *url* in a worker thread and return the ``requests`` response."""
        loop = asyncio.get_event_loop()
        # requests is synchronous; calling it directly here would stall every
        # other download until this one finished.
        return await loop.run_in_executor(
            None,
            lambda: requests.get(url, headers=self.headers, timeout=self.timeout))

    async def findallData(self, regex, res):
        """Return all matches of *regex* in ``res.text`` (DOTALL, IGNORECASE, MULTILINE)."""
        return re.findall(regex, res.text, re.S | re.I | re.M)

    # Fetch one chapter
    async def novelBody(self, url):
        """Download one chapter page and extract its title and text.

        Returns:
            A dict with the keys ``url``, ``title`` and ``body``; ``body`` is
            the title, a newline, then the first chapter-body match.

        Raises:
            IndexError: if the title or the body pattern matches nothing.
        """
        res = await self.requestsData(url)
        res.encoding = res.apparent_encoding
        # Chapter pages are parsed with the chapter-title pattern supplied in
        # ``regexBody``, not with the outline page's title pattern.
        title = await self.findallData(self.regexBodyName, res)
        body = await self.findallData(self.regexBody, res)
        if not title or not body:
            raise IndexError(
                "chapter title/body pattern matched nothing on {}".format(url))
        print(title, "下载成功!")
        return {'url': url, 'title': title[0], 'body': title[0] + '\n' + body[0]}

    async def pymongoSave(self, url, name, db):
        """Download the chapter at *url* and insert it into collection *name* of *db*."""
        data = await self.novelBody(url)
        await self.saveDB(name, db, data)

    async def saveDB(self, name, db, data):
        """Insert *data* into ``db[name]`` from a worker thread."""
        loop = asyncio.get_event_loop()
        # insert_one blocks on the network, so keep it off the event loop.
        await loop.run_in_executor(None, db[name].insert_one, data)

    async def _downloadAll(self, links, name, db):
        """Save every chapter in *links* concurrently and report the failures."""
        links = list(links)
        if not links:
            # Nothing to schedule.
            return
        # return_exceptions=True keeps one broken chapter from aborting the
        # remaining downloads; failures are reported below instead.
        results = await asyncio.gather(
            *[self.pymongoSave(link, name, db) for link in links],
            return_exceptions=True)
        for link, result in zip(links, results):
            if isinstance(result, Exception):
                print("{} 下载失败: {!r}".format(link, result))

    # Run the spider
    def start(self, url, ip, dbName):
        """Crawl the novel whose outline page is *url* into MongoDB.

        Args:
            url: URL of the outline page.
            ip: MongoDB address passed to ``pymongo.MongoClient``.
            dbName: name of the database; the collection is named after the
                novel title found on the outline page.
        """
        print("爬虫启动...")
        novelName, links = self.novelLinks(url)
        print("开始创建数据库...")
        db = self.pymongoSpider(ip, dbName)
        try:
            print("数据库创建完毕!")
            print("异步爬虫启动...")
            # Use a private loop so that closing it afterwards does not close
            # the process-wide default event loop.
            loop = asyncio.new_event_loop()
            try:
                print("异步爬虫下载中...")
                loop.run_until_complete(self._downloadAll(links, novelName, db))
            finally:
                loop.close()
            print("异步爬虫关闭!")
        finally:
            # Release the MongoDB connection pool.
            db.client.close()
        print(dbName + ' 下载完成!')
if __name__ == '__main__':
    # Script entry point: crawl one novel from xbiquge into the MongoDB
    # server on localhost and print the elapsed wall-clock time.
    began = time.time()

    # Outline page: (chapter-link pattern, novel-title pattern).
    outline_patterns = (
        r"<dd><a href='(.*?)' >.*?</a></dd>",
        r'<h1>(.*?)</h1>',
    )
    # Chapter page: (chapter-body pattern, chapter-title pattern).
    chapter_patterns = (
        r'<div id="content">(.*?)<p>.*?</p></div>',
        r'<h1>(.*?)</h1>',
    )

    crawler = asycioSpider(outline_patterns, chapter_patterns)
    crawler.start(
        r'http://www.xbiquge.la/10/10489/',
        'localhost:27017',
        'biquge',
    )

    elapsed = time.time() - began
    print("The Program run end {}".format(elapsed))
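# NOTE: the lines below are leftover text from the web page this script was
# copied from; they are kept as comments so the module remains importable.
# Logo
#
# More recommendations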