Title Extraction Script

From Alperen, 6 Months ago, written in Plain Text.

Embed

Download Paste or View Raw
Hits: 179

import requests

from bs4 import BeautifulSoup

from urllib.parse import urlparse, urljoin

# Page whose outgoing links are scanned; replace with the URL you want to
# start from.
start_url = "https://tr.wikipedia.org/wiki/Fred_la_marmotte"

# Accumulates one (URL, title) tuple per linked page that yielded a title.
page_info = []

# Function to fetch and parse a web page

def fetch_and_parse(url, timeout=10):
    """Download ``url`` and return the contents of its ``<title>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on an unresponsive
            host, stalling the whole crawl.

    Returns:
        ``soup.title.string`` for the parsed page, the literal ``'No Title'``
        when the page has no ``<title>`` element, or ``None`` when the request
        fails (the error is printed rather than raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses into exceptions so they are reported below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.title.string if soup.title else 'No Title'

# Function to visit links and fetch titles

def visit_links_and_fetch_titles(url, timeout=10):
    """Record the titles of the web.archive.org pages linked from ``url``.

    Downloads ``url``, inspects every ``<a>`` element that carries an ``href``
    and, for each link that starts with ``http`` and contains
    ``web.archive.org``, fetches the linked page with ``fetch_and_parse``.
    Every ``(link_url, title)`` pair with a truthy title is appended to the
    module-level ``page_info`` list.

    Args:
        url: Address of the page whose links are scanned.
        timeout: Seconds to wait for ``url`` itself before giving up, so an
            unresponsive host cannot hang the script. Linked pages are
            fetched with ``fetch_and_parse``'s own settings.

    Returns:
        None. Results are accumulated in ``page_info``; a failure to download
        ``url`` is printed and leaves ``page_info`` unchanged.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses into exceptions so they are reported below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        link_url = link['href']
        # Only absolute links that mention the Wayback Machine host are followed.
        if link_url.startswith('http') and 'web.archive.org' in link_url:
            title = fetch_and_parse(link_url)
            if title:
                page_info.append((link_url, title))

# Start the process: scan the start page and collect titles into page_info.
visit_links_and_fetch_titles(start_url)

# Print the collected (URL, title) pairs, one blank line between entries.
print("URL and Title:")
for url, title in page_info:
    print(f"URL: {url}")
    print(f"Title: {title}\n")

Author

Title

Language

Your paste - Paste your paste here

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# Page whose outgoing links are scanned; replace with the URL you want to
# start from.
start_url = "https://tr.wikipedia.org/wiki/Fred_la_marmotte"

# Accumulates one (URL, title) tuple per linked page that yielded a title.
page_info = []

# Function to fetch and parse a web page
def fetch_and_parse(url, timeout=10):
    """Download ``url`` and return the contents of its ``<title>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on an unresponsive
            host, stalling the whole crawl.

    Returns:
        ``soup.title.string`` for the parsed page, the literal ``'No Title'``
        when the page has no ``<title>`` element, or ``None`` when the request
        fails (the error is printed rather than raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses into exceptions so they are reported below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.title.string if soup.title else 'No Title'

# Function to visit links and fetch titles
def visit_links_and_fetch_titles(url, timeout=10):
    """Record the titles of the web.archive.org pages linked from ``url``.

    Downloads ``url``, inspects every ``<a>`` element that carries an ``href``
    and, for each link that starts with ``http`` and contains
    ``web.archive.org``, fetches the linked page with ``fetch_and_parse``.
    Every ``(link_url, title)`` pair with a truthy title is appended to the
    module-level ``page_info`` list.

    Args:
        url: Address of the page whose links are scanned.
        timeout: Seconds to wait for ``url`` itself before giving up, so an
            unresponsive host cannot hang the script. Linked pages are
            fetched with ``fetch_and_parse``'s own settings.

    Returns:
        None. Results are accumulated in ``page_info``; a failure to download
        ``url`` is printed and leaves ``page_info`` unchanged.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses into exceptions so they are reported below.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    for link in soup.find_all('a', href=True):
        link_url = link['href']
        # Only absolute links that mention the Wayback Machine host are followed.
        if link_url.startswith('http') and 'web.archive.org' in link_url:
            title = fetch_and_parse(link_url)
            if title:
                page_info.append((link_url, title))

# Start the process: scan the start page and collect titles into page_info.
visit_links_and_fetch_titles(start_url)

# Print the collected (URL, title) pairs, one blank line between entries.
print("URL and Title:")
for url, title in page_info:
    print(f"URL: {url}")
    print(f"Title: {title}\n")

Private - Private pastes aren't shown in recent listings.

Delete After - When should we delete your paste?

Spam protection -

{"html5":"htmlmixed","css":"css","javascript":"javascript","php":"php","python":"python","ruby":"ruby","lua":"text\/x-lua","bash":"text\/x-sh","go":"go","c":"text\/x-csrc","cpp":"text\/x-c++src","diff":"diff","latex":"stex","sql":"sql","xml":"xml","apl":"apl","asterisk":"asterisk","c_loadrunner":"text\/x-csrc","c_mac":"text\/x-csrc","coffeescript":"text\/x-coffeescript","csharp":"text\/x-csharp","d":"d","ecmascript":"javascript","erlang":"erlang","groovy":"text\/x-groovy","haskell":"text\/x-haskell","haxe":"text\/x-haxe","html4strict":"htmlmixed","java":"text\/x-java","java5":"text\/x-java","jquery":"javascript","mirc":"mirc","mysql":"sql","ocaml":"text\/x-ocaml","pascal":"text\/x-pascal","perl":"perl","perl6":"perl","plsql":"sql","properties":"text\/x-properties","q":"text\/x-q","scala":"scala","scheme":"text\/x-scheme","tcl":"text\/x-tcl","vb":"text\/x-vb","verilog":"text\/x-verilog","yaml":"text\/x-yaml","z80":"text\/x-z80"}

Title Extraction Script

Reply to "Title Extraction Script"