Question

I have a small script that fetches company data from a website. This website is regularly updated with new company information. How can I update my CSV with new records on a periodic basis? Also, as you can see in the code, I have used an explicit range for the pages; what other solutions are possible?

The following is the code -

from selenium.webdriver import Firefox
from selenium.webdriver.common.keys import Keys 
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
import csv


# Navigate to the YourStory companies page and start collecting data;
# when the collection populates 30 elements, click through to the next page.

START_URL = 'https://yourstory.com/companies/search?page=1&hitsPerPage=30'


class CompDeetz():

    def __init__(self):
        self.browser = Firefox()
        self.browser.get(START_URL)
        sleep(20)
        self.browser.find_element_by_xpath('/html/body/div[12]/div/div/button').click()
        sleep(5)
        self.browser.find_element_by_xpath('/html/body/div[1]/div[4]').click()
        self.database = []



    def write_row(self,record):

        with open('test.csv', 'a') as t:
            writer = csv.writer(t)
            writer.writerows(record)
    


    def get_everything(self):

        all_list = [ (a.text) for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        all_records = []
        for company in all_list:

            record = company.split('\n')
            all_records.append(record)
    

        self.write_row(all_records)



    def next_page(self):

        self.browser.find_element_by_xpath('//ul[@class="ais-Pagination-list"]/li[7]/a').click()
        sleep(20)



def main():
    t = CompDeetz()
    t.get_everything()
    for i in range(33):
        t.next_page()
        t.get_everything()


if __name__ == "__main__":
    main()

Answers

Instead of having two different methods, get_everything and next_page, and calling them multiple times, you can have one method, get_everything, and call it once:

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException


def get_everything(self):
    all_records = []
    next_page = True
    while next_page:
        all_list = [a.text for a in self.browser.find_elements_by_xpath('//tr[@class="hit"]')]
        for company in all_list:
            record = company.split('\n')
            all_records.append(record)
        try:
            next_page_link = WebDriverWait(self.browser, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//a[@aria-label='Next page']")))
            self.browser.execute_script("arguments[0].scrollIntoView();", next_page_link)
            self.browser.execute_script("arguments[0].click();", next_page_link)
            sleep(5)  # give the next page time to load
        except TimeoutException:
            # On the last page there is no clickable "Next page" link, so the wait times out.
            next_page = False

    self.write_row(all_records)

With this change, main() only needs to create the object and call get_everything once; the explicit range(33) loop goes away.
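The question also asks how to keep the CSV updated with new records on a periodic basis. A minimal sketch, assuming the first column of each scraped row (the company name) can serve as a stable identifier (that is an assumption about the data, and append_new_records is a hypothetical helper, not part of the original script): load the keys already present in test.csv and append only the rows that have not been seen before.

import csv
import os

def append_new_records(path, records):
    # Collect the identifiers (first column) already stored in the CSV.
    seen = set()
    if os.path.exists(path):
        with open(path, newline='') as f:
            for row in csv.reader(f):
                if row:
                    seen.add(row[0])
    # Keep only records whose identifier has not been written yet.
    new_rows = [r for r in records if r and r[0] not in seen]
    with open(path, 'a', newline='') as f:
        csv.writer(f).writerows(new_rows)
    return len(new_rows)

For the periodic part, scheduling the whole script externally is usually the simplest option, e.g. a cron entry such as 0 6 * * * python /path/to/scraper.py for a daily run (or Task Scheduler on Windows), rather than keeping the browser session alive between runs.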

Note: take care of the pop-up that appears on the page. I hope you already have a mechanism to handle it.
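If you do not have that mechanism yet, one option is to replace the hard-coded sleep(20) and the absolute XPath click in __init__ with an explicit wait. A rough sketch, reusing the pop-up button XPath from the question (which may no longer match the live page):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

def dismiss_popup(browser, timeout=20):
    # Wait for the pop-up's close button and click it; carry on if it never appears.
    try:
        button = WebDriverWait(browser, timeout).until(
            EC.element_to_be_clickable((By.XPATH, '/html/body/div[12]/div/div/button')))
        button.click()
    except TimeoutException:
        pass  # no pop-up was shown within the timeout

Calling dismiss_popup(self.browser) right after self.browser.get(START_URL) also keeps __init__ from blocking for a fixed 20 seconds when the pop-up loads faster.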
