Facebook
From Alperen, 6 Months ago, written in Plain Text.
Embed
Download Paste or View Raw
Hits: 179
  1. import requests
  2. from bs4 import BeautifulSoup
  3. from urllib.parse import urlparse, urljoin
  4.  
  # Entry page whose links will be inspected (swap in any other URL to start elsewhere)
  start_url = 'https://tr.wikipedia.org/wiki/Fred_la_marmotte'

  # Collected (URL, title) tuples; starts empty and is filled in as pages are visited
  page_info = list()
  10.  
  11. # Function to fetch and parse a web page
  def fetch_and_parse(url, timeout=10):
      """Download *url* and return the text of its HTML ``<title>``.

      Args:
          url: Address of the page to download.
          timeout: Seconds to wait for the server before giving up. Without a
              timeout ``requests.get`` may block indefinitely on a stalled
              connection.

      Returns:
          The title text with surrounding whitespace removed, ``'No Title'``
          when the page has no ``<title>`` or the title is blank, or ``None``
          when the request fails (the error is printed, not raised).
      """
      # Keep the try body limited to the calls that can raise a request error.
      try:
          response = requests.get(url, timeout=timeout)
          response.raise_for_status()
      except requests.exceptions.RequestException as e:
          print(f"Error: {e}")
          return None

      soup = BeautifulSoup(response.text, 'html.parser')

      # ``soup.title.string`` is None for an empty <title> or one with nested
      # tags, which callers would mistake for a failed fetch. get_text() always
      # yields a string, so a blank result can fall back to 'No Title' instead.
      title = soup.title.get_text().strip() if soup.title else ''
      return title or 'No Title'
  24.  
  25. # Function to visit links and fetch titles
  def visit_links_and_fetch_titles(url, timeout=10):
      """Record the titles of the web.archive.org pages linked from *url*.

      Downloads *url*, looks at every ``<a>`` tag that has an ``href``, and for
      each distinct link that starts with ``http`` and contains
      ``web.archive.org`` calls ``fetch_and_parse``. Every link that yields a
      title is appended to the module-level ``page_info`` list as a
      ``(link_url, title)`` tuple.

      Args:
          url: Page whose outgoing links are inspected.
          timeout: Seconds to wait for *url* before giving up. Without a
              timeout ``requests.get`` may block indefinitely.

      Returns:
          None. A request error for *url* is printed, not raised.
      """
      try:
          response = requests.get(url, timeout=timeout)
          response.raise_for_status()
      except requests.exceptions.RequestException as e:
          print(f"Error: {e}")
          return

      soup = BeautifulSoup(response.text, 'html.parser')

      # A page often repeats the same archive link; remember what has already
      # been recorded or attempted so each URL is downloaded at most once.
      seen = {known_url for known_url, _ in page_info}

      for link in soup.find_all('a', href=True):
          link_url = link['href']
          # NOTE: this is a substring match, so 'web.archive.org' anywhere in an
          # absolute URL qualifies, not only in the host part.
          if not (link_url.startswith('http') and 'web.archive.org' in link_url):
              continue
          if link_url in seen:
              continue
          seen.add(link_url)

          title = fetch_and_parse(link_url)
          if title:
              page_info.append((link_url, title))
  43.  
  # Kick off link collection from the configured start page
  visit_links_and_fetch_titles(start_url)

  # Report every collected (URL, title) pair; each entry ends with a blank line
  print("URL and Title:")
  for url, title in page_info:
      print(f"URL: {url}", f"Title: {title}\n", sep="\n")
  52.