Amazon亚马逊跨境电商产品图片爬取

从BestSeller中获取热销排行产品链接

# -*- coding: utf-8 -*-
# @Time: 2021/1/10 11:50
# @File: Amazon_pic.py
# @Software: PyCharm
import os
import time
import urllib.request
import re
from lxml import etree

import requests
from bs4 import BeautifulSoup
import xlwt
import sqlite3

def save_proLink(link):
    """Write the collected product links into the first column of '1.xls'.

    Args:
        link: list of product-link strings to persist, one per row.
    """
    print("save...")
    book = xlwt.Workbook(encoding='utf-8')
    # cell_overwrite_ok=True allows rewriting a cell (xlwt forbids it by default)
    sheet = book.add_sheet("前排行榜", cell_overwrite_ok=True)
    for row, data in enumerate(link):
        sheet.write(row, 0, data)
        # fix: the original passed the counter as a second print() argument
        # instead of formatting it into the string
        print("成功写入第%d" % (row + 1))
    print("完成")

    book.save('1.xls')


def askURl(baseurl):
    """Fetch *baseurl* with a desktop browser User-Agent and return the body.

    Args:
        baseurl: URL to request.

    Returns:
        str: the response body decoded as UTF-8, or an empty string when the
        request fails (the HTTP status code and/or failure reason are printed).
    """
    headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36"
    }
    req = urllib.request.Request(url=baseurl, headers=headers)
    html = ""
    try:
        # fix: close the HTTPResponse deterministically (original leaked it)
        with urllib.request.urlopen(req) as response:
            html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def readlocal():
    """Parse the cached '1.html' and return absolute Amazon product URLs.

    Returns:
        list[str]: full product links ('https://www.amazon.co.uk' + href)
        extracted from the cached bestseller anchor snippet.
    """
    pro_links = []
    # fix: close the file deterministically (the original never closed it)
    with open("1.html", 'rb') as local_link_html:
        bs = BeautifulSoup(local_link_html, 'lxml')
    item = bs.select(".a-link-normal")  # every anchor carrying that class

    # fix: anchor the capture on the closing quote — the original pattern
    # r'href="(.*?)>' stopped at '>' and left a trailing '"' in every link
    model = re.compile(r'<a class="a-link-normal" href="(.*?)"')
    link = re.findall(model, str(item))  # findall needs a string, not Tag objects
    for href in link:
        pro_links.append("https://www.amazon.co.uk" + href)
    return pro_links

def readonline():
    """Download the bestseller page and cache its product anchors to '1.html'.

    Relies on the module-level ``baseurl`` assigned in the __main__ section.
    The stringified anchor list is written so readlocal() can re-parse it
    offline on later runs.
    """
    # fix: dropped the pointless single-iteration `for i in range(0, 1)` loop
    html = askURl(baseurl)
    bs = BeautifulSoup(html, 'lxml')
    # anchors inside each bestseller grid item
    link_html = bs.select(".aok-inline-block.zg-item > .a-link-normal")
    # fix: the `with` block already closes the file; the stray f.close() is gone
    with open('1.html', 'w', encoding='utf-8') as f:
        f.write(str(link_html))


def find_imgs(pro_link):
    """Scrape a product page and return its 'large' image URLs.

    Args:
        pro_link: full Amazon product URL containing a '/dp/<ASIN>/' segment.

    Returns:
        list[str]: large-size image URLs parsed out of the page's
        ImageBlockATF script blob. The raw page is also cached as
        '<ASIN>.html' for offline inspection.

    Raises:
        IndexError: when the link has no '/dp/<ASIN>/' segment or the page
        does not contain the ImageBlockATF data block.
    """
    # fix: renamed `id` -> `asin` to stop shadowing the builtin
    asin = re.findall(r'dp/(.+?)/', pro_link, re.S)[0]
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

    print(f'商品ASIN为:{asin},图片链接正在采集,请稍后..')
    response = requests.get(pro_link, headers=head, timeout=8)

    html = response.content.decode('utf-8')
    # cache the raw page keyed by ASIN for debugging
    with open(f'{asin}.html', 'w', encoding='utf-8') as f:
        f.write(html)

    # the image JSON lives inside the inline ImageBlockATF script on the page
    imgs_text = re.findall(r'ImageBlockATF(.+?)return data;', html, re.S)[0]
    imgs = re.findall(r'"large":"(.+?)","main":', imgs_text, re.S)
    return imgs

def down_img(imgs):
    """Download every image URL in *imgs* into the local 'amazon' folder.

    Args:
        imgs: iterable of direct image URLs; the last path segment of each
        URL becomes the local file name.
    """
    # fix: dropped the dead `if i == len(imgs): break` guard (the for loop
    # already stops at the end) and the manual counter
    for i, url in enumerate(imgs):
        # NOTE(review): no timeout or status-code check here — a dead link
        # will hang or save an error body; confirm acceptable for a one-off
        r = requests.get(url, stream=True)
        img_name = url.split('/')[-1]
        print(img_name)
        # fix: the original embedded the invalid escape '\%s' in the path
        # literal; build the same 'amazon\<name>' path explicitly instead
        save_path = 'C:\\Users\\DELL\\Pictures\\Saved Pictures\\amazon\\' + img_name
        with open(save_path, 'wb') as f:
            f.write(r.content)
            # fix: removed the redundant f.close() — `with` handles closing
        print(f"成功下载第{i+1}张图片")


def Save_All_Data(pro_links):
    """Download the image set of the first product in *pro_links*.

    Args:
        pro_links: list of product-page URLs; only the first one is
        processed (the loop breaks after one link — raise the limit by
        adjusting the break condition).
    """
    for count, link in enumerate(pro_links):
        if count == 1:
            break
        imgs = find_imgs(link)
        # fix: the original called down_img(img) once per URL, which made
        # down_img iterate over the *characters* of a single URL string;
        # pass the whole list instead
        down_img(imgs)


if __name__ == '__main__':
    # Bestseller listing to scrape (pet-supplies category on amazon.co.uk);
    # readonline() reads this module-level name as a global.
    baseurl = "https://www.amazon.co.uk/gp/bestsellers/pet-supplies/13154166031/"
    # '1.html' caches the bestseller anchors: fetch once, then reuse locally.
    if os.path.exists("1.html"):
        print("本地爬取")
    else:
        print("第一次爬取")
        readonline()
    pro_links = readlocal()
    print("成功获取热销产品链接")
    # Only the first product's images are scraped and downloaded here.
    imgs = find_imgs(pro_links[0])
    print("成功获取图片链接")
    down_img(imgs)
    #Save_All_Data(pro_links)
    #save_proLink(pro_links)


Logo

亚马逊云科技开发者 Build On 是由亚马逊团队策划、开发者社区联合打造的动手实操系列活动。

更多推荐