Facebook
From Augustus, 5 Months ago, written in Plain Text.
import requests
from bs4 import BeautifulSoup
import re
import csv

BASE_URL = 'https://www.thedailysentry.net/2023/10/yexel-sebastian-nagsalita-investment.html'  # Replace with the actual base URL

def scrape_page(url):
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        article = soup.find('article')

        # Specify the elements to remove
        elements_to_remove = article.find_all(['span', 'td', 'div'], style=['font-size: x-small;', 'text-align: center;'])
        elements_to_remove += article.find_all('div', class_='separator', style='clear: both; text-align: center;')

        for element in elements_to_remove:
            element.extract()

        # Extract the text content and insert a space after periods, question marks, and exclamation points
        modified_content = re.sub(r'(?<=[.!?])', ' ', article.get_text(strip=True))

        # Remove text starting with "source:"
        modified_content = re.sub(r'source:.*', '', modified_content, flags=re.IGNORECASE)

        # Remove "***"
        modified_content = modified_content.replace('***', '')

        return modified_content.strip()  # Use strip to remove leading and trailing spaces

    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None

def scrape_and_save_multiple_articles(start_url, num_articles, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['URL', 'Content'])  # Write header

        for _ in range(num_articles):
            print(f"Scraping article - {start_url}")

            # Scrape the cleaned text of the current article
            content = scrape_page(start_url)

            if content:
                csv_writer.writerow([start_url, content])

            # Get the URL of the next article
            next_article_link = get_next_article_link(start_url)

            if next_article_link:
                start_url = next_article_link
            else:
                print("No more articles to scrape.")
                break

def get_next_article_link(current_url):
    response = requests.get(current_url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        older_link = soup.find('a', class_='blog-pager-older-link')

        if older_link and 'href' in older_link.attrs:
            return older_link['href']

    return None

if __name__ == "__main__":
    start_url = BASE_URL  # Replace with the actual starting URL
    num_articles_to_scrape = 1000  # Replace with the desired number of articles to scrape
    output_csv_file = 'scraped_data.csv'  # Replace with the desired output CSV file name

    scrape_and_save_multiple_articles(start_url, num_articles_to_scrape, output_csv_file)
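
A quick way to sanity-check the results, assuming the script above has already run and written scraped_data.csv (the default name set in the __main__ block) to the working directory:

import csv

# Print each scraped URL followed by the first 80 characters of its cleaned text,
# reading back the 'URL' and 'Content' columns the scraper writes.
with open('scraped_data.csv', newline='', encoding='utf-8') as csvfile:
    for row in csv.DictReader(csvfile):
        print(row['URL'], '->', row['Content'][:80])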