import requests
from bs4 import BeautifulSoup
import re
import csv

BASE_URL = 'https://www.thedailysentry.net/2023/10/yexel-sebastian-nagsalita-investment.html'  # Replace with the actual base URL
def scrape_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        article = soup.find('article')
        if article is None:
            print(f"No <article> element found at {url}")
            return None
        # Remove boilerplate elements: small print, centered captions, and separators
        elements_to_remove = article.find_all(['span', 'td', 'div'], style=['font-size: x-small;', 'text-align: center;'])
        elements_to_remove += article.find_all('div', class_='separator', style='clear: both; text-align: center;')
        for element in elements_to_remove:
            element.extract()
        # Extract the text and insert a space after periods, question marks, and
        # exclamation points that run directly into the next sentence
        modified_content = re.sub(r'(?<=[.!?])(?=\S)', ' ', article.get_text(strip=True))
        # Remove text starting with "source: "
        modified_content = re.sub(r'source: .*', '', modified_content, flags=re.IGNORECASE)
        # Remove "***"
        modified_content = modified_content.replace('***', '')
        return modified_content.strip()  # Strip leading and trailing spaces
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None
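
# Quick illustration of the spacing regex used above. Both groups are
# zero-width, so a space is inserted only where sentence punctuation runs
# straight into the next word; existing spacing is left alone:
#   re.sub(r'(?<=[.!?])(?=\S)', ' ', 'One.Two! Three?')  ->  'One. Two! Three?'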
def scrape_and_save_multiple_articles(start_url, num_articles, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['URL', 'Content'])  # Write header
        for _ in range(num_articles):
            print(f"Scraping article - {start_url}")
            content = scrape_page(start_url)
            if content:
                csv_writer.writerow([start_url, content])
            # Get the URL of the next article
            next_article_link = get_next_article_link(start_url)
            if next_article_link:
                start_url = next_article_link
            else:
                print("No more articles to scrape.")
                break
def get_next_article_link(current_url):
    response = requests.get(current_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        older_link = soup.find('a', class_='blog-pager-older-link')
        if older_link and 'href' in older_link.attrs:
            return older_link['href']
    return None
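
# Optional sketch (not part of the original script): a shared session with a
# fixed pause between fetches is kinder to the server than back-to-back
# requests.get calls. The one-second delay and ten-second timeout below are
# assumed values; tune them to the site's rate limits before using this.
import time

session = requests.Session()

def polite_get(url, delay_seconds=1.0):
    """Fetch a URL after a fixed pause, reusing one connection via the session."""
    time.sleep(delay_seconds)  # assumed delay; adjust as needed
    return session.get(url, timeout=10)  # assumed timeout; adjust as needed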
if __name__ == "__main__":
    start_url = BASE_URL  # Replace with the actual starting URL
    num_articles_to_scrape = 1000  # Replace with the desired number of articles to scrape
    output_csv_file = 'scraped_data.csv'  # Replace with the desired output CSV file name
    scrape_and_save_multiple_articles(start_url, num_articles_to_scrape, output_csv_file)