[python爬虫]如何爬取特定网页的图片
摘要:本文使用 urllib 抓取百度贴吧帖子页面,用 chardet 检测页面编码并完成转码,再用正则表达式提取其中的 .jpg 图片地址并下载到本地。完整代码如下。
·
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Created on Sun Aug 02 20:10:36 2015
@author: lijiong
"""
import urllib
import sys
import chardet
import re
def get_html(url):
    """Download *url* and return its body re-encoded for the local system.

    The raw bytes are decoded with the charset guessed by ``chardet``
    (undecodable bytes are dropped) and then re-encoded with the encoding
    reported by ``sys.getfilesystemencoding()``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The page content, encoded with the filesystem encoding.
    """
    page = urllib.urlopen(url)
    try:
        content = page.read()
    finally:
        # Release the connection even when the read fails.
        page.close()

    # Either lookup may produce None: chardet stores None under 'encoding'
    # when it cannot guess (so dict.get's default is never used), and
    # getfilesystemencoding() is documented to return None on some Unix
    # setups. Fall back to UTF-8 instead of crashing in decode/encode.
    type_encode = sys.getfilesystemencoding() or 'utf-8'
    info_encode = chardet.detect(content).get('encoding') or 'utf-8'

    # Page content with its encoding already converted.
    html = content.decode(info_encode, 'ignore').encode(type_encode)
    return html
def get_img(html, save_dir='C:/Users/lijiong/Desktop/new'):
    """Download every Baidu forum ``.jpg`` image referenced in *html*.

    Each matching image URL is printed and then saved as ``<index>.jpg``
    inside *save_dir*, with indexes starting at 0 in order of appearance.

    Args:
        html: Page markup to scan for ``src="..."`` image attributes.
        save_dir: Directory the images are written to. Defaults to the
            location the script originally hard-coded.

    Returns:
        None.
    """
    # [^"] keeps each match inside a single src attribute; a lazy ".*?" could
    # run past the closing quote to a later ".jpg" on the same line. The dots
    # in the host name are escaped so they only match literal dots.
    imgre = re.compile(r'src="(http://imgsrc\.baidu\.com/forum/w[^"]*?\.jpg)"')
    # With a single capture group, findall returns only the group's text.
    imglist = imgre.findall(html)
    for i, imgurl in enumerate(imglist):
        print(imgurl)
        urllib.urlretrieve(imgurl, '%s/%s.jpg' % (save_dir, i))
# Script entry: fetch the forum thread page, then save the images it links to.
page_url = 'http://tieba.baidu.com/p/3837885162'
html = get_html(page_url)
get_img(html)
更多推荐
已为社区贡献1条内容
所有评论(0)