How to batch download images with Python
This article presents a complete approach to batch downloading images with Python. The basic version uses the requests library for single-threaded downloads, with URL parsing, error handling, and automatic directory creation. The advanced version adds BeautifulSoup to scrape image URLs from a web page and offers a multithreaded speed-up. The enhanced version adds a User-Agent header, content-type detection, and download-delay control, supporting concurrent downloads with detailed result statistics. The article closes with best-practice suggestions, including proxy support, rate limiting, and resumable downloads, giving a full set of solutions from basic to advanced.
Batch downloading images in Python can be done with the steps below; complete code examples are provided for each stage.
Basic approach: using the requests library
import os
import requests
from urllib.parse import urlparse

def download_images(image_urls, save_dir='images'):
    """
    Batch download images into the given directory
    :param image_urls: list of image URLs
    :param save_dir: target directory (defaults to an 'images' folder in the current directory)
    """
    # Create the target directory
    os.makedirs(save_dir, exist_ok=True)
    for url in image_urls:
        try:
            # Send the HTTP request
            response = requests.get(url, stream=True, timeout=5)
            response.raise_for_status()  # Check that the request succeeded
            # Extract the file name from the URL path
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)
            if not filename:
                filename = f"image_{len(os.listdir(save_dir)) + 1}.jpg"
            # Save the file in chunks
            filepath = os.path.join(save_dir, filename)
            with open(filepath, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            print(f"Downloaded: {filename}")
        except Exception as e:
            print(f"Failed to download {url} - error: {str(e)}")

# Example usage
if __name__ == "__main__":
    # Image URLs parsed from a web page (example)
    image_urls = [
        "https://example.com/images/cat.jpg",
        "https://example.com/images/dog.png",
        "https://example.com/images/bird.webp"
    ]
    download_images(image_urls)
Advanced approach: scraping images from a web page
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests

def scrape_images_from_webpage(url, save_dir='images'):
    """Scrape all images from a web page and download them"""
    try:
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        img_tags = soup.find_all('img')
        # Resolve relative src values against the page URL
        image_urls = [urljoin(url, img.get('src')) for img in img_tags if img.get('src')]
        download_images(image_urls, save_dir)
    except Exception as e:
        print(f"Scraping failed: {str(e)}")

# Example usage
scrape_images_from_webpage("https://example.com/gallery")
Key points:

- Dependency installation:

  pip install requests beautifulsoup4

- Core features:
  - Automatically creates the target directory
  - Handles file names sensibly (keeps the original name or generates one)
  - Streams downloads to avoid holding whole files in memory
  - Thorough error handling

- Suggested extension: multithreaded speed-up with concurrent.futures, as in the snippet below (download_single_image stands for a per-URL helper; the enhanced version further down defines one and wraps it so the save directory is passed along):

  from concurrent.futures import ThreadPoolExecutor
  # Inside download_images, replace the for-loop with:
  with ThreadPoolExecutor(max_workers=8) as executor:
      executor.map(download_single_image, image_urls)

- Precautions:
  - Respect the site's robots.txt (see the sketch after this list)
  - Add a User-Agent header to avoid being blocked
  - Add a download delay so your IP does not get banned
  - Handle different image formats (decide by MIME type)
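A minimal sketch of the robots.txt check mentioned above, using the standard-library urllib.robotparser; can_fetch_url is an illustrative helper name, not part of the code in this article:

from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def can_fetch_url(url, user_agent="*"):
    """Check the site's robots.txt before downloading from it."""
    parsed = urlparse(url)
    rp = RobotFileParser()
    rp.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
    rp.read()
    return rp.can_fetch(user_agent, url)

You would call can_fetch_url(url) before each download and skip URLs for which it returns False.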
Complete enhanced version
import os
import requests
from urllib.parse import urlparse
from concurrent.futures import ThreadPoolExecutor
import time

def download_single_image(url, save_dir):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, stream=True, timeout=10)
        response.raise_for_status()
        # Work out the extension from the content type (fall back to jpg if it is missing)
        content_type = response.headers.get('content-type', '')
        ext = 'jpg' if 'jpeg' in content_type else (content_type.split('/')[-1] or 'jpg')
        filename = os.path.basename(urlparse(url).path) or f"img_{int(time.time()*1000)}.{ext}"
        filepath = os.path.join(save_dir, filename)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(8192):
                f.write(chunk)
        return True, filename
    except Exception as e:
        return False, str(e)

def batch_download(image_urls, save_dir='images', max_workers=8, delay=0.5):
    os.makedirs(save_dir, exist_ok=True)
    success = []
    failed = []

    def worker(url):
        result, info = download_single_image(url, save_dir)
        if result:
            success.append(info)
        else:
            failed.append((url, info))
        time.sleep(delay)  # Throttle so the server is not flooded with requests

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(worker, image_urls)

    print(f"\nDownload finished! Succeeded: {len(success)} Failed: {len(failed)}")
    return success, failed

# Usage example
if __name__ == "__main__":
    urls = [
        "https://example.com/image1.jpg",
        "https://example.com/image2.png",
        # Add more URLs...
    ]
    success, failed = batch_download(urls)
    print("\nSuccessfully downloaded:")
    for name in success:
        print(f" - {name}")
    print("\nFailed:")
    for url, reason in failed:
        print(f" - {url}: {reason}")
Best-practice suggestions:
- Proxy support: pass a proxies argument to requests.get (first sketch below)
- Rate limiting: control request frequency with time.sleep()
- Resumable downloads: check how much of the file is already on disk and request only the remaining bytes (second sketch below)
- Automatic retries: implement with the tenacity library (first sketch below)
- File deduplication: detect duplicate files by comparing content hashes (first sketch below)
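A minimal sketch of the proxy, retry, and deduplication suggestions, assuming tenacity is installed (pip install tenacity); download_with_retry, PROXIES, and seen_hashes are illustrative names, not part of the code above:

import hashlib
import requests
from tenacity import retry, stop_after_attempt, wait_fixed

# Example proxy settings (assumption: replace with your own proxy, or drop the argument entirely)
PROXIES = {"http": "http://127.0.0.1:8080", "https": "http://127.0.0.1:8080"}

@retry(stop=stop_after_attempt(3), wait=wait_fixed(2))  # retry up to 3 times, waiting 2 s between attempts
def download_with_retry(url):
    """Fetch an image through a proxy, retrying transient failures."""
    response = requests.get(url, proxies=PROXIES, timeout=10)
    response.raise_for_status()
    return response.content

def is_duplicate(data, seen_hashes):
    """Return True if this image content was already saved (MD5 of the bytes)."""
    digest = hashlib.md5(data).hexdigest()
    if digest in seen_hashes:
        return True
    seen_hashes.add(digest)
    return False

Here seen_hashes would be a set() kept across the whole download loop; skip writing the file whenever is_duplicate returns True.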
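And a minimal sketch of resumable downloads, assuming the server supports HTTP Range requests; resume_download is an illustrative helper, not part of the code above:

import os
import requests

def resume_download(url, filepath):
    """Continue a partial download by requesting only the bytes not yet on disk."""
    downloaded = os.path.getsize(filepath) if os.path.exists(filepath) else 0
    headers = {"Range": f"bytes={downloaded}-"} if downloaded else {}
    response = requests.get(url, headers=headers, stream=True, timeout=10)
    if response.status_code == 416:  # Range not satisfiable: the file is already complete
        return
    response.raise_for_status()
    mode = "ab" if response.status_code == 206 else "wb"  # append only if the server honoured the Range header
    with open(filepath, mode) as f:
        for chunk in response.iter_content(8192):
            f.write(chunk)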
This walkthrough covers the complete implementation from the basic to the advanced version, so you can pick whichever fits your needs. The code includes detailed error handling and logged output, making it a solid starting point for production use.