Python3 async spider (asyncio turns out to be about as fast as plain serial code... huh)
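The timing result surprised me: the asyncio version finishes in roughly the same time as a serial loop. The reason is that requests.get is a blocking call, and @asyncio.coroutine only wraps the function; the yield from never actually suspends on network I/O, so the event loop just runs the downloads one after another. Two ways around this are sketched after the code.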
import asyncio, requests, pymongo
import re, time


class asycioSpider(object):
    """A novel spider: fetches chapter pages with asyncio coroutines and stores them in MongoDB."""
    # Request headers
    headers = {'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Mobile Safari/537.36'}

    def __init__(self, outline, regexBody):
        super(asycioSpider, self).__init__()
        self.regex = outline[0]            # chapter-link pattern for the index page
        self.regexName = outline[1]        # title pattern (<h1>)
        self.regexBody = regexBody[0]      # chapter-body pattern
        self.regexBodyName = regexBody[1]  # chapter-title pattern (unused)

    # Initialize the database connection
    def pymongoSpider(self, ip, dbName):
        client = pymongo.MongoClient(ip)
        db = client[dbName]
        return db

    # Get the chapter URLs of one novel
    def novelLinks(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = res.apparent_encoding
        print("Novel index page downloaded!")
        links = re.findall(self.regex, res.text, re.S | re.I | re.M)
        links = iter([url + link.split('/')[-1] for link in links])
        name = re.findall(self.regexName, res.text, re.S | re.I | re.M)
        print("Novel index page parsed!")
        return (name[0], links)

    @asyncio.coroutine
    def requestsData(self, url):
        # requests.get is a blocking call, so this "coroutine" never actually suspends
        return requests.get(url, headers=self.headers)

    @asyncio.coroutine
    def findallData(self, regex, res):
        return re.findall(regex, res.text, re.S | re.I | re.M)

    # Fetch one chapter
    @asyncio.coroutine
    def novelBody(self, url):
        # start = time.time()
        res = yield from self.requestsData(url)
        # print("requests: {}".format(time.time() - start))
        # start = time.time()
        res.encoding = res.apparent_encoding
        # print("encoding: {}".format(time.time() - start))
        # start = time.time()
        title = yield from self.findallData(self.regexName, res)
        # print("findall chapterName: {}".format(time.time() - start))
        # start = time.time()
        body = yield from self.findallData(self.regexBody, res)
        # print("findall chapterBody: {}".format(time.time() - start))
        print(title[0], "downloaded!")
        return {'url': url, 'title': title[0], 'body': title[0] + '\n' + body[0]}

    @asyncio.coroutine
    def pymongoSave(self, url, name, db):
        # start = time.time()
        data = yield from self.novelBody(url)
        yield from self.saveDB(name, db, data)
        # print("saveDB: {}".format(time.time() - start))

    @asyncio.coroutine
    def saveDB(self, name, db, data):
        db[name].insert_one(data)

    # Run the spider
    def start(self, url, ip, dbName):
        print("Spider starting...")
        novelName, links = self.novelLinks(url)
        print("Creating the database...")
        db = self.pymongoSpider(ip, dbName)
        print("Database created!")
        print("Async spider starting...")
        loop = asyncio.get_event_loop()
        print("Async spider downloading...")
        tasks = [self.pymongoSave(link, novelName, db) for link in links]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
        print("Async spider closed!")
        print(dbName + ' download finished!')


if __name__ == '__main__':
    start_time = time.time()
    outline = (r"<dd><a href='(.*?)' >.*?</a></dd>", r'<h1>(.*?)</h1>')
    regexBody = (r'<div id="content">(.*?)<p>.*?</p></div>', r'<h1>(.*?)</h1>')
    url = r'http://www.xbiquge.la/10/10489/'
    ip = 'localhost:27017'
    spider = asycioSpider(outline, regexBody)
    spider.start(url, ip, 'biquge')
    print("Total run time: {:.2f}s".format(time.time() - start_time))