Facebook
From karo, 7 Months ago, written in Python.
Embed
Download Paste or View Raw
Hits: 392
  1. ### Browse through all the 10 pages on the web-site
  2. ## https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=1
  3. ## collect/print all the hyperlinks that lead to each individual profile in terminal/CSV
  4. ## Concatenate the collected urls and go through each profile (you can create another program, by just looping through the list)
  5. ## Collect the below information (if existent on each profile) and print it in CSV file
  6. ## name, DOB, nationality, address,start and end date, case referrence
  7.  
  8. ## Desired output:
  9. ## profileUrl + ‘!’ + name + ‘!’ + dob + ‘!’ + nationality + ‘!’ + address + ‘!’  + startDate + ‘!’ + endDate  + ‘!’ + caseReference + ‘n’
  10.  
  11. from selenium import webdriver
  12. from selenium.webdriver.common.by import By
  13. import time
  14. import csv
  15. from selenium.common.exceptions import NoSuchElementException
  16.  
  17. driver = webdriver.Chrome()
  18.  
  19. # Creating/Opening a file to write te data in. Please change the pasth to the correct csv file on your computer
  20. with open(r'C:UsersKarolina.GugudisDesktopFind and update company info.csv', 'w', encoding="utf-8") as f:  # w is for writing a new file
  21.     f.write('profilesurls n')
  22.  
  23.  
  24. driver.get('https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A')
  25. driver.maximize_window()
  26. driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
  27. time.sleep(5)
  28.  
  29. urls = ['https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=1',
  30.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=2',
  31.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=3',
  32.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=4',
  33.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=5',
  34.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=6',
  35.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=7',
  36.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=8',
  37.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=9',
  38.         'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=10'
  39.         ]
  40.  
  41. for link in urls:
  42.     driver.get(link)
  43.     time.sleep(3)
  44.  
  45.     for i in range(2, 52): # the hyperlink of the names to each profile
  46.         x = '//*[@id="search-container"]/div[1]/table/tbody/tr['
  47.         x = x + str(i) + ']/td[1]/a'
  48.         # //*[@id="search-container"]/div[1]/table/tbody/tr[2]/td[1]/a
  49.         # //*[@id="search-container"]/div[1]/table/tbody/tr[3]/td[1]/a
  50.         # //*[@id="search-container"]/div[1]/table/tbody/tr[4]/td[1]/a
  51.         # //*[@id="search-container"]/div[1]/table/tbody/tr[5]/td[1]/a
  52.         # //*[@id="search-container"]/div[1]/table/tbody/tr[51]/td[1]/a
  53.         # //*[@id="search-container"]/div[1]/table/tbody/tr[25]/td[1]/a - last page # 10
  54.  
  55.         try:
  56.             profileurl = driver.find_element(By.XPATH,x).get_attribute('href')  
  57.         except NoSuchElementException:
  58.             profileurl = 'no name'
  59.         print(profileurl)
  60.  
  61.         with open(r'C:UsersKarolina.GugudisDesktopFind and update company info.csv', 'a', encoding="utf-8") as f: # a is for appending information to the file
  62.             f.write(profileurl + 'n')
  63.    
  64. print('the job is done')
  65.