Untitled

From karo, 7 Months ago, written in Python.

Embed

Download Paste or View Raw
Hits: 392

### Browse through all the 10 pages on the web-site

## https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=1

## collect/print all the hyperlinks that lead to each individual profile in terminal/CSV

## Concatenate the collected urls and go through each profile (you can create another program, by just looping through the list)

## Collect the below information (if existent on each profile) and print it in CSV file

## name, DOB, nationality, address, start and end date, case reference

## Desired output:

## profileUrl + ‘!’ + name + ‘!’ + dob + ‘!’ + nationality + ‘!’ + address + ‘!’ + startDate + ‘!’ + endDate + ‘!’ + caseReference + ‘\n’

from selenium import webdriver

from selenium.webdriver.common.by import By

import time

import csv

from selenium.common.exceptions import NoSuchElementException

# Collect the hyperlink of every individual profile listed on the "A" pages of
# the register of disqualifications and write them, one per line, to a CSV file.

# Output file. Please change the path to the correct CSV file on your computer.
OUTPUT_CSV = r'C:\Users\Karolina.Gugudis\Desktop\Find and update company info.csv'

BASE_URL = 'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A'
PAGE_COUNT = 10  # pages ?page=1 .. ?page=10 are visited

# The profile links sit in table rows tr[2] .. tr[51] of each results page.
FIRST_ROW = 2
LAST_ROW = 51
ROW_LINK_XPATH = '//*[@id="search-container"]/div[1]/table/tbody/tr[{row}]/td[1]/a'

# Placeholder written when a row has no profile link (e.g. the last page is
# shorter than the others, so its trailing rows do not exist).
MISSING_LINK = 'no name'

urls = [f'{BASE_URL}?page={page}' for page in range(1, PAGE_COUNT + 1)]

driver = webdriver.Chrome()
try:
    # Open the output once ('w' starts a new file) instead of reopening it in
    # append mode for every single row.
    with open(OUTPUT_CSV, 'w', encoding="utf-8") as f:
        f.write('profilesurls \n')

        driver.get(BASE_URL)
        driver.maximize_window()
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)

        for link in urls:
            driver.get(link)
            time.sleep(3)  # give the page time to load before reading the table

            for i in range(FIRST_ROW, LAST_ROW + 1):
                x = ROW_LINK_XPATH.format(row=i)
                try:
                    # get_attribute returns None when the anchor has no href;
                    # fall back to the placeholder so the write cannot fail.
                    profileurl = driver.find_element(By.XPATH, x).get_attribute('href') or MISSING_LINK
                except NoSuchElementException:
                    profileurl = MISSING_LINK
                print(profileurl)
                f.write(profileurl + '\n')
finally:
    # Always close the browser, even if scraping or file access fails.
    driver.quit()

print('the job is done')

Author

Title

Language

Your paste - Paste your paste here

### Browse through all the 10 pages on the web-site 
## https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A?page=1
## collect/print all the hyperlinks that lead to each individual profile in terminal/CSV
## Concatenate the collected urls and go through each profile (you can create another program, by just looping through the list)
## Collect the below information (if existent on each profile) and print it in CSV file
## name, DOB, nationality, address, start and end date, case reference

## Desired output:
## profileUrl + ‘!’ + name + ‘!’ + dob + ‘!’ + nationality + ‘!’ + address + ‘!’ + startDate + ‘!’ + endDate + ‘!’ + caseReference + ‘\n’

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv
from selenium.common.exceptions import NoSuchElementException
 
# Collect the hyperlink of every individual profile listed on the "A" pages of
# the register of disqualifications and write them, one per line, to a CSV file.

# Output file. Please change the path to the correct CSV file on your computer.
OUTPUT_CSV = r'C:\Users\Karolina.Gugudis\Desktop\Find and update company info.csv'

BASE_URL = 'https://find-and-update.company-information.service.gov.uk/register-of-disqualifications/A'
PAGE_COUNT = 10  # pages ?page=1 .. ?page=10 are visited

# The profile links sit in table rows tr[2] .. tr[51] of each results page.
FIRST_ROW = 2
LAST_ROW = 51
ROW_LINK_XPATH = '//*[@id="search-container"]/div[1]/table/tbody/tr[{row}]/td[1]/a'

# Placeholder written when a row has no profile link (e.g. the last page is
# shorter than the others, so its trailing rows do not exist).
MISSING_LINK = 'no name'

urls = [f'{BASE_URL}?page={page}' for page in range(1, PAGE_COUNT + 1)]

driver = webdriver.Chrome()
try:
    # Open the output once ('w' starts a new file) instead of reopening it in
    # append mode for every single row.
    with open(OUTPUT_CSV, 'w', encoding="utf-8") as f:
        f.write('profilesurls \n')

        driver.get(BASE_URL)
        driver.maximize_window()
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)

        for link in urls:
            driver.get(link)
            time.sleep(3)  # give the page time to load before reading the table

            for i in range(FIRST_ROW, LAST_ROW + 1):
                x = ROW_LINK_XPATH.format(row=i)
                try:
                    # get_attribute returns None when the anchor has no href;
                    # fall back to the placeholder so the write cannot fail.
                    profileurl = driver.find_element(By.XPATH, x).get_attribute('href') or MISSING_LINK
                except NoSuchElementException:
                    profileurl = MISSING_LINK
                print(profileurl)
                f.write(profileurl + '\n')
finally:
    # Always close the browser, even if scraping or file access fails.
    driver.quit()

print('the job is done')

Private - Private paste aren't shown in recent listings.

Delete After - When should we delete your paste?

Spam protection -

{"html5":"htmlmixed","css":"css","javascript":"javascript","php":"php","python":"python","ruby":"ruby","lua":"text\/x-lua","bash":"text\/x-sh","go":"go","c":"text\/x-csrc","cpp":"text\/x-c++src","diff":"diff","latex":"stex","sql":"sql","xml":"xml","apl":"apl","asterisk":"asterisk","c_loadrunner":"text\/x-csrc","c_mac":"text\/x-csrc","coffeescript":"text\/x-coffeescript","csharp":"text\/x-csharp","d":"d","ecmascript":"javascript","erlang":"erlang","groovy":"text\/x-groovy","haskell":"text\/x-haskell","haxe":"text\/x-haxe","html4strict":"htmlmixed","java":"text\/x-java","java5":"text\/x-java","jquery":"javascript","mirc":"mirc","mysql":"sql","ocaml":"text\/x-ocaml","pascal":"text\/x-pascal","perl":"perl","perl6":"perl","plsql":"sql","properties":"text\/x-properties","q":"text\/x-q","scala":"scala","scheme":"text\/x-scheme","tcl":"text\/x-tcl","vb":"text\/x-vb","verilog":"text\/x-verilog","yaml":"text\/x-yaml","z80":"text\/x-z80"}

Reply to "Untitled"