Answer a question

I am trying to create a dataset with images of doors and windows. For this I am interested in downloading images from one of the websites offering a vast collection. The only problem with the webpage is that to load more images I need to scroll down. Later, I will parse all the HTML content using BeautifulSoup, but I am able to download only handful of images.

import requests
import os
import argparse
import time
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.firefox import GeckoDriverManager


def is_valid(url):
    """
    Checks whether the 'url' is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_images(url):
    """
    Returns all image URLs on a single 'url'
    """
    
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    driver.get(url)
    
    elem = driver.find_element_by_tag_name("body")
    no_of_pagedowns = 40

    while no_of_pagedowns:
        elem.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)
        no_of_pagedowns -= 1
    
    # beautiful soup allows parsing html of the webpage
    soup = bs(driver.page_source, "html.parser")
    driver.quit()

    urls = []
    for fig in tqdm(soup.find_all("figure"), "Extracting images"):
        try: 
            img_url = fig.find_all("img")[0]["src"]
        except:
            continue

        if not img_url:
            # if img does not contain src attribute then skip it
            continue

        # few images contain HTTP GET key value pairs, e.g. '/image.png?c=3.2.5'
        # we will remove the keys
        try:
            pos = img_url.index("?")
            img_url = img_url[:pos]
        except ValueError:
            pass

        if is_valid(img_url):
            urls.append(img_url)
    return urls

def main(args):
    # get all images
    imgs = get_all_images(args.url)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download images from a given URL."
    )
    parser.add_argument("--url", type=str,
                        help="a URL from where to download images",
                        default="https://unsplash.com/s/photos/door")
    parser.add_argument("--path", type=str,
                        help="path to store the downloaded images locally",
                        default="Photos/Doors")
    args = parser.parse_args()
    main(args)

The HTML of the webpage is such that within the tag figure are present img. The images of interest are sharing class _2VWD4 _2zEKz. The number of pagedowns is hardly having any impact as the number of images downloaded everytime is random. Is there flaw in my way of scraping the web content using BeautifulSoup? I wish to download roughly 5000 images from both the categories.

I have referenced this link for scrolling the webpage.

The website from where I download images is unsplash.

Answers

Your job became much simpler now. The website makes an ajax call to load the data which returns a json with image links

import requests

page_no = 1
res = requests.get("https://unsplash.com/napi/search/photos?query=window&xp=&per_page=20&page={}".format(page_no))
print(res.json())

You can loop over the json to get the image urls and also you change the page numbers.

Output:

{'total': 52107, 'total_pages': 2606, 'results': [{'id': '4gRNmhGzYZE', 'created_at': '2018-05-26T12:54:44-04:00', 'updated_at': '2020-07-07T01:27:39-04:00', 'promoted_at': '2018-05-27T06:46:05-04:00', 'width': 4000, 'height': 6000, 'color': '#D9D9E5', 'description': None, 'alt_description': 'gray wooden windowpane', 'urls': {'raw': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}, 'links': {'self': 'https://api.unsplash.com/photos/4gRNmhGzYZE', 'html': 'https://unsplash.com/photos/4gRNmhGzYZE', 'download': 'https://unsplash.com/photos/4gRNmhGzYZE/download', 'download_location': 'https://api.unsplash.com/photos/4gRNmhGzYZE/download'}, 'categories': [], 'likes': 269, 'liked_by_user': False, 'current_user_collections': [], 'sponsorship': None, 'user': {'id': 'FkDy50M-Pvk', 'updated_at': '2020-07-28T15:30:30-04:00', 'username': 'slrncl', 'name': 'Nicolas Solerieu', 'first_name': 'Nicolas', 'last_name': 'Solerieu', 'twitter_username': None, 'portfolio_url': 'http://slrncl.com/', 'bio': '193x92x35 — Fujifilm\r\nMostly plants and landscape, I travel slow when I do.', 'location': 'San Francisco', 'links': {'self': 'https://api.unsplash.com/users/slrncl', 'html': 'https://unsplash.com/@slrncl', 'photos': 'https://api.unsplash.com/users/slrncl/photos', 'likes': 'https://api.unsplash.com/users/slrncl/likes', 'portfolio': 'https://api.unsplash.com/users/slrncl/portfolio', 'following': 'https://api.unsplash.com/users/slrncl/following', 'followers': 'https://api.unsplash.com/users/slrncl/followers'}, 'profile_image': {'small': 'https://images.unsplash.com/profile-1553381672056-e843a19634d1?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=32&w=32', 'medium': 'https://images.unsplash.com/profile-1553381672056-e843a19634d1?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=64&w=64', 'large': 'https://images.unsplash.com/profile-1553381672056-e843a19634d1?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=128&w=128'}, 'instagram_username': 'solerieunicolas', 'total_collections': 1, 'total_likes': 65, 'total_photos': 94, 'accepted_tos': True}, 'tags': [{'type': 'landing_page', 'title': 'window', 'source': {'ancestry': {'type': {'slug': 'wallpapers', 'pretty_slug': 'HD Wallpapers'}, 'category': {'slug': 'desktop', 'pretty_slug': 'Desktop'}, 'subcategory': {'slug': 'windows', 'pretty_slug': 'Windows'}}, 'title': 'HD Windows Wallpapers', 'subtitle': 'Download Free Windows Wallpapers', 'description': 'Choose from a curated selection of Windows wallpapers for your mobile and desktop screens. Always free on Unsplash.', 'meta_title': 'Windows Wallpapers: Free HD Download [500+ HQ] | Unsplash', 'meta_description': 'Choose from hundreds of free Windows wallpapers. Download HD wallpapers for free on Unsplash.', 'cover_photo': {'id': 'R9OS29xJb-8', 'created_at': '2017-07-13T19:38:01-04:00', 'updated_at': '2020-07-21T01:05:51-04:00', 'promoted_at': '2017-07-14T22:49:56-04:00', 'width': 3456, 'height': 2304, 'color': '#9DB2CD', 'description': 'Ergh Jebbi', 'alt_description': 'sand landscape', 'urls': {'raw': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'full': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'regular': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'small': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'thumb': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjQzMzEwfQ'}, 'links': {'self': 'https://api.unsplash.com/photos/R9OS29xJb-8', 'html': 'https://unsplash.com/photos/R9OS29xJb-8', 'download': 'https://unsplash.com/photos/R9OS29xJb-8/download', 'download_location': 'https://api.unsplash.com/photos/R9OS29xJb-8/download'}, 'categories': [], 'likes': 1460, 'liked_by_user': False, 'current_user_collections': [], 'sponsorship': None, 'user': {'id': 'zpgEV0k9XAA', 'updated_at': '2020-07-23T09:39:36-04:00', 'username': 'm______________e', 'name': 'Mark Eder', 'first_name': 'Mark', 'last_name': 'Eder', 'twitter_username': None, 'portfolio_url': 'http://www.markeder.photography', 'bio': None, 'location': 'Vienna', 'links': {'self': 'https://api.unsplash.com/users/m______________e', 'html': 'https://unsplash.com/@m______________e', 'photos': 'https://api.unsplash.com/users/m______________e/photos', 'likes': 'https://api.unsplash.com/users/m______________e/likes', 'portfolio': 'https://api.unsplash.com/users/m______________e/portfolio', 'following': 'https://api.unsplash.com/users/m______________e/following', 'followers': 'https://api.unsplash.com/users/m______________e/followers'}, 'profile_image': {'small': 'https://images.unsplash.com/profile-1488557507434-790fb0197775?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=32&w=32', 'medium': 'https://images.unsplash.com/profile-1488557507434-790fb0197775?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=64&w=64', 'large': 'https://images.unsplash.com/profile-1488557507434-790fb0197775?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=128&w=128'}, 'instagram_username': 'm_______________________e', 'total_collections': 0, 'total_likes': 19, 'total_photos': 14, 'accepted_tos': False}}}}, {'type': 'landing_page', 'title': 'transparent', 'source': {'ancestry': {'type': {'slug': 'backgrounds', 'pretty_slug': 'Backgrounds'}, 'category': {'slug': 'art', 'pretty_slug': 'Art'}, 'subcategory': {'slug': 'transparent', 'pretty_slug': 'Transparent'}}, 'title': 'Transparent Backgrounds', 'subtitle': 'Download free transparent background images', 'description': "Looking to keep your backgrounds simple? Choose a transparent background from Unsplash's catalogue of professional-quality, high resolution photos. Always free on Unsplash.", 'meta_title': '900+ Transparent Background Images: Download HD Backgrounds on Unsplash', 'meta_description': 'Choose from hundreds of free transparent backgrounds. Download beautiful, curated free backgrounds on Unsplash.', 'cover_photo': {'id': '3cWA3U8xb5w', 'created_at': '2017-06-07T01:54:27-04:00', 'updated_at': '2020-07-28T01:20:31-04:00', 'promoted_at': '2017-06-08T05:20:00-04:00', 'width': 3872, 'height': 2592, 'color': '#F4F4F6', 'description': 'Ice shapes the mountains.', 'alt_description': 'person holding clear glass panel', 'urls': {'raw': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1', 'full': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb', 'regular': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max', 'small': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max', 'thumb': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max'}, 'links

....
....
...

Beauty of the website is that it returns images with different resolutions.

import requests

page_no = 1
res = requests.get("https://unsplash.com/napi/search/photos?query=window&xp=&per_page=20&page={}".format(page_no))
data = res.json()

for row in data["results"]:
    print(row["urls"])
    print("--" * 10)

Output:

{'raw': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
....
...
Logo

Python社区为您提供最前沿的新闻资讯和知识内容

更多推荐