BeautifulSoup unable to load all images from a scrolling page
Answer a question
I am trying to create a dataset of door and window images. To do this, I want to download images from a website that offers a vast collection. The only problem is that the page loads more images only as I scroll down. After scrolling, I parse the full HTML content with BeautifulSoup, but I am able to download only a handful of images.
import requests
import os
import argparse
import time
from tqdm import tqdm
from bs4 import BeautifulSoup as bs
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.firefox import GeckoDriverManager
def is_valid(url):
    """
    Checks whether the 'url' is a valid absolute URL.

    A URL is considered valid when it has both a scheme (e.g. 'https') and
    a network location (e.g. 'example.com'). Relative paths such as
    '/image.png' and scheme-less URLs such as '//host/a.png' are rejected.

    Returns True or False. Malformed strings that urlparse refuses to
    parse are reported as invalid instead of raising.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises ValueError on malformed input, e.g. an unbalanced
        # IPv6 bracket in 'http://[bad'. Such a URL is simply not valid.
        return False
    return bool(parsed.netloc) and bool(parsed.scheme)
def get_all_images(url, no_of_pagedowns=40, pause=0.2):
    """
    Returns all image URLs found inside <figure> tags on a single 'url'.

    The page is opened in headless Firefox and PAGE_DOWN is sent to the
    <body> element 'no_of_pagedowns' times, sleeping 'pause' seconds after
    each key press, before the rendered HTML is captured and parsed.

    Parameters:
        url: address of the page to scrape.
        no_of_pagedowns: how many PAGE_DOWN key presses to send (default 40).
        pause: seconds to wait after each key press (default 0.2).

    Returns:
        A list of absolute image URLs with any query string removed, in
        document order. Figures without an <img>, images without a 'src',
        and URLs rejected by is_valid() are skipped.
    """
    # Imported locally so this function brings its own dependency into
    # scope; the find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    options = Options()
    # '-headless' is the Firefox command-line switch; passing it as an
    # argument does not depend on the deprecated 'headless' property.
    options.add_argument("-headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(url)
        body = driver.find_element(By.TAG_NAME, "body")
        for _ in range(no_of_pagedowns):
            body.send_keys(Keys.PAGE_DOWN)
            time.sleep(pause)
        page_source = driver.page_source
    finally:
        # Always shut the browser down, even if loading or scrolling failed,
        # so no Firefox/geckodriver process is left behind.
        driver.quit()

    # beautiful soup allows parsing html of the webpage
    soup = bs(page_source, "html.parser")
    urls = []
    for fig in tqdm(soup.find_all("figure"), "Extracting images"):
        img = fig.find("img")
        if img is None:
            # figure without any img tag
            continue
        img_url = img.get("src")
        if not img_url:
            # if img does not contain src attribute then skip it
            continue
        # few images contain HTTP GET key value pairs, e.g. '/image.png?c=3.2.5'
        # we will remove the keys
        img_url = img_url.split("?", 1)[0]
        if is_valid(img_url):
            urls.append(img_url)
    return urls
def main(args):
    """
    Collects the image URLs for the page named in the parsed arguments.

    'args' is the argparse namespace built by the script entry point; only
    'args.url' is read here. The list produced by get_all_images() is
    returned so the caller can use it instead of it being discarded.

    NOTE: 'args.path' is accepted on the command line, but this function
    does not download or store anything yet.
    """
    # get all images
    return get_all_images(args.url)
if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(description="Download images from a given URL.")
    # (flag, help text, default value) for every supported option
    option_specs = (
        (
            "--url",
            "a URL from where to download images",
            "https://unsplash.com/s/photos/door",
        ),
        (
            "--path",
            "path to store the downloaded images locally",
            "Photos/Doors",
        ),
    )
    for flag, help_text, default_value in option_specs:
        parser.add_argument(flag, type=str, help=help_text, default=default_value)
    args = parser.parse_args()
    main(args)
In the HTML of the webpage, each img tag sits inside a figure tag. The images of interest share the classes _2VWD4 _2zEKz. The number of page-downs hardly has any impact: the number of images downloaded is different on every run. Is there a flaw in the way I am scraping the web content with BeautifulSoup? I wish to download roughly 5000 images from both categories.
I have referenced this link for scrolling the webpage.
The website from where I download images is unsplash.
Answers
Your job just became much simpler. The website makes an AJAX call to load the data, and that call returns JSON containing the image links:
import requests
# Ask the site's JSON search endpoint for one page of results instead of
# scraping the rendered HTML.
page_no = 1
res = requests.get(
    "https://unsplash.com/napi/search/photos",
    # requests builds the query string: query=window&xp=&per_page=20&page=<page_no>
    params={"query": "window", "xp": "", "per_page": 20, "page": page_no},
    # without a timeout a stalled connection would block forever
    timeout=10,
)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
res.raise_for_status()
print(res.json())
You can loop over the JSON to get the image URLs, and change the page number to fetch further pages.
Output:
{'total': 52107, 'total_pages': 2606, 'results': [{'id': '4gRNmhGzYZE', 'created_at': '2018-05-26T12:54:44-04:00', 'updated_at': '2020-07-07T01:27:39-04:00', 'promoted_at': '2018-05-27T06:46:05-04:00', 'width': 4000, 'height': 6000, 'color': '#D9D9E5', 'description': None, 'alt_description': 'gray wooden windowpane', 'urls': {'raw': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}, 'links': {'self': 'https://api.unsplash.com/photos/4gRNmhGzYZE', 'html': 'https://unsplash.com/photos/4gRNmhGzYZE', 'download': 'https://unsplash.com/photos/4gRNmhGzYZE/download', 'download_location': 'https://api.unsplash.com/photos/4gRNmhGzYZE/download'}, 'categories': [], 'likes': 269, 'liked_by_user': False, 'current_user_collections': [], 'sponsorship': None, 'user': {'id': 'FkDy50M-Pvk', 'updated_at': '2020-07-28T15:30:30-04:00', 'username': 'slrncl', 'name': 'Nicolas Solerieu', 'first_name': 'Nicolas', 'last_name': 'Solerieu', 'twitter_username': None, 'portfolio_url': 'http://slrncl.com/', 'bio': '193x92x35 — Fujifilm\r\nMostly plants and landscape, I travel slow when I do.', 'location': 'San Francisco', 'links': {'self': 'https://api.unsplash.com/users/slrncl', 'html': 'https://unsplash.com/@slrncl', 'photos': 'https://api.unsplash.com/users/slrncl/photos', 'likes': 
'https://api.unsplash.com/users/slrncl/likes', 'portfolio': 'https://api.unsplash.com/users/slrncl/portfolio', 'following': 'https://api.unsplash.com/users/slrncl/following', 'followers': 'https://api.unsplash.com/users/slrncl/followers'}, 'profile_image': {'small': 'https://images.unsplash.com/profile-1553381672056-e843a19634d1?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=32&w=32', 'medium': 'https://images.unsplash.com/profile-1553381672056-e843a19634d1?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=64&w=64', 'large': 'https://images.unsplash.com/profile-1553381672056-e843a19634d1?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=128&w=128'}, 'instagram_username': 'solerieunicolas', 'total_collections': 1, 'total_likes': 65, 'total_photos': 94, 'accepted_tos': True}, 'tags': [{'type': 'landing_page', 'title': 'window', 'source': {'ancestry': {'type': {'slug': 'wallpapers', 'pretty_slug': 'HD Wallpapers'}, 'category': {'slug': 'desktop', 'pretty_slug': 'Desktop'}, 'subcategory': {'slug': 'windows', 'pretty_slug': 'Windows'}}, 'title': 'HD Windows Wallpapers', 'subtitle': 'Download Free Windows Wallpapers', 'description': 'Choose from a curated selection of Windows wallpapers for your mobile and desktop screens. Always free on Unsplash.', 'meta_title': 'Windows Wallpapers: Free HD Download [500+ HQ] | Unsplash', 'meta_description': 'Choose from hundreds of free Windows wallpapers. 
Download HD wallpapers for free on Unsplash.', 'cover_photo': {'id': 'R9OS29xJb-8', 'created_at': '2017-07-13T19:38:01-04:00', 'updated_at': '2020-07-21T01:05:51-04:00', 'promoted_at': '2017-07-14T22:49:56-04:00', 'width': 3456, 'height': 2304, 'color': '#9DB2CD', 'description': 'Ergh Jebbi', 'alt_description': 'sand landscape', 'urls': {'raw': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'full': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'regular': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'small': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjQzMzEwfQ', 'thumb': 'https://images.unsplash.com/photo-1499988921418-b7df40ff03f9?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjQzMzEwfQ'}, 'links': {'self': 'https://api.unsplash.com/photos/R9OS29xJb-8', 'html': 'https://unsplash.com/photos/R9OS29xJb-8', 'download': 'https://unsplash.com/photos/R9OS29xJb-8/download', 'download_location': 'https://api.unsplash.com/photos/R9OS29xJb-8/download'}, 'categories': [], 'likes': 1460, 'liked_by_user': False, 'current_user_collections': [], 'sponsorship': None, 'user': {'id': 'zpgEV0k9XAA', 'updated_at': '2020-07-23T09:39:36-04:00', 'username': 'm______________e', 'name': 'Mark Eder', 'first_name': 'Mark', 'last_name': 'Eder', 'twitter_username': None, 'portfolio_url': 'http://www.markeder.photography', 'bio': None, 'location': 'Vienna', 'links': {'self': 'https://api.unsplash.com/users/m______________e', 'html': 'https://unsplash.com/@m______________e', 'photos': 'https://api.unsplash.com/users/m______________e/photos', 'likes': 
'https://api.unsplash.com/users/m______________e/likes', 'portfolio': 'https://api.unsplash.com/users/m______________e/portfolio', 'following': 'https://api.unsplash.com/users/m______________e/following', 'followers': 'https://api.unsplash.com/users/m______________e/followers'}, 'profile_image': {'small': 'https://images.unsplash.com/profile-1488557507434-790fb0197775?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=32&w=32', 'medium': 'https://images.unsplash.com/profile-1488557507434-790fb0197775?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=64&w=64', 'large': 'https://images.unsplash.com/profile-1488557507434-790fb0197775?ixlib=rb-1.2.1&q=80&fm=jpg&crop=faces&cs=tinysrgb&fit=crop&h=128&w=128'}, 'instagram_username': 'm_______________________e', 'total_collections': 0, 'total_likes': 19, 'total_photos': 14, 'accepted_tos': False}}}}, {'type': 'landing_page', 'title': 'transparent', 'source': {'ancestry': {'type': {'slug': 'backgrounds', 'pretty_slug': 'Backgrounds'}, 'category': {'slug': 'art', 'pretty_slug': 'Art'}, 'subcategory': {'slug': 'transparent', 'pretty_slug': 'Transparent'}}, 'title': 'Transparent Backgrounds', 'subtitle': 'Download free transparent background images', 'description': "Looking to keep your backgrounds simple? Choose a transparent background from Unsplash's catalogue of professional-quality, high resolution photos. Always free on Unsplash.", 'meta_title': '900+ Transparent Background Images: Download HD Backgrounds on Unsplash', 'meta_description': 'Choose from hundreds of free transparent backgrounds. 
Download beautiful, curated free backgrounds on Unsplash.', 'cover_photo': {'id': '3cWA3U8xb5w', 'created_at': '2017-06-07T01:54:27-04:00', 'updated_at': '2020-07-28T01:20:31-04:00', 'promoted_at': '2017-06-08T05:20:00-04:00', 'width': 3872, 'height': 2592, 'color': '#F4F4F6', 'description': 'Ice shapes the mountains.', 'alt_description': 'person holding clear glass panel', 'urls': {'raw': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1', 'full': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb', 'regular': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max', 'small': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max', 'thumb': 'https://images.unsplash.com/photo-1496814801204-280c839ea15a?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max'}, 'links
....
....
...
The beauty of this website is that it returns each image in several resolutions.
import requests
# Fetch one page of search results and print the URL set of every photo.
page_no = 1
res = requests.get(
    "https://unsplash.com/napi/search/photos",
    # requests builds the query string: query=window&xp=&per_page=20&page=<page_no>
    params={"query": "window", "xp": "", "per_page": 20, "page": page_no},
    # without a timeout a stalled connection would block forever
    timeout=10,
)
# Fail loudly on an HTTP error instead of trying to decode an error page as JSON.
res.raise_for_status()
data = res.json()
for row in data["results"]:
    # row["urls"] maps a size name ('raw', 'full', 'regular', 'small',
    # 'thumb') to the image URL at that resolution
    print(row["urls"])
    print("--" * 10)
Output:
{'raw': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1527352774566-e4916e36c645?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1509644851169-2acc08aa25b5?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1484068043587-e86f6124d2ee?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1473252812967-d565c3607e28?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
{'raw': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&ixid=eyJhcHBfaWQiOjEyMDd9', 'full': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=85&fm=jpg&crop=entropy&cs=srgb&ixid=eyJhcHBfaWQiOjEyMDd9', 'regular': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=1080&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'small': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=400&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9', 'thumb': 'https://images.unsplash.com/photo-1520971434558-0b482cb0013f?ixlib=rb-1.2.1&q=80&fm=jpg&crop=entropy&cs=tinysrgb&w=200&fit=max&ixid=eyJhcHBfaWQiOjEyMDd9'}
--------------------
....
...
更多推荐

所有评论(0)