How to periodically fetch records from a website using selenium?
·
Answer a question
I have a small script that fetches company data from a website. This website gets regularly updated with new company information. How can I update my csv with new records on a periodic basis? Also as you can see in the code I have used an explicit range for the pages, what other solutions are possible?
The following is the code -
from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import csv
#navigate to the ystory companies page
#start collecting data from ystory
START_URL = 'https://yourstory.com/companies/search?page=1&hitsPerPage=30'
#when the collection populates 30 elements then click on next page
class CompDeetz():
def __init__(self):
self.browser = Firefox()
self.browser.get(START_URL)
sleep(20)
self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
sleep(5)
self.browser.find_element_by_xpath('/html/body/div[1]/div[4]').click()
self.database = []
def write_row(self,record):
with open('test.csv', 'a') as t:
writer = csv.writer(t)
writer.writerows(record)
def get_everything(self):
all_list = [ (a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
all_records = []
for company in all_list:
record = company.split('\n')
all_records.append(record)
self.write_row(all_records)
def next_page(self):
self.browser.find_element_by_xpath('//ul[@class="ais-Pagination-list"]/li[7]/a').click()
sleep(20)
def main():
t = CompDeetz()
t.get_everything()
for i in range(33):
t.next_page()
t.get_everything()
if __name__ == "__main__":
main()
Answers
Instead of having two different methods get_everything and next_page and calling them multiple times. You can have one method get_everything and call it once. def get_everything(self):
all_records = []
nextPage = True
while nextPage:
all_list = [ (a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
for company in all_list:
record = company.split('\n')
all_records.append(record)
try:
nextPagelink = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']")))
driver.execute_script("arguments[0].scrollIntoView();", nextPagelink)
driver.execute_script("arguments[0].click();", nextPagelink)
time.sleep(5) # for next [age to load
#As on last page, next page link is not available. It will throw exception
except NoSuchElementException:
nextpage = False
self.write_row(all_records)
Note : take care of Pop up coming on page. I hope you already have mechanism to handle it.
更多推荐

所有评论(0)