import requests
from bs4 import BeautifulSoup
import re
import csv

BASE_URL = 'https://www.thedailysentry.net/2023/10/yexel-sebastian-nagsalita-investment.html'  # Replace with the actual base URL
def scrape_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        article = soup.find('article')
        if article is None:
            print(f"No <article> element found at {url}")
            return None
        # Remove boilerplate elements: small print, centered captions, and separators
        elements_to_remove = article.find_all(['span', 'td', 'div'], style=['font-size: x-small;', 'text-align: center;'])
        elements_to_remove += article.find_all('div', class_='separator', style='clear: both; text-align: center;')
        for element in elements_to_remove:
            element.extract()
        # Extract the text and insert a space after periods, question marks, and
        # exclamation points that run directly into the next sentence
        modified_content = re.sub(r'(?<=[.!?])(?=\S)', ' ', article.get_text(strip=True))
        # Remove text starting with "source: "
        modified_content = re.sub(r'source: .*', '', modified_content, flags=re.IGNORECASE)
        # Remove "***"
        modified_content = modified_content.replace('***', '')
        return modified_content.strip()  # Strip leading and trailing spaces
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None
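
# Quick illustration of the spacing regex used above. Both groups are
# zero-width, so a space is inserted only where sentence punctuation runs
# straight into the next word; existing spacing is left alone:
#   re.sub(r'(?<=[.!?])(?=\S)', ' ', 'One.Two! Three?')  ->  'One. Two! Three?'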
def scrape_and_save_multiple_articles(start_url, num_articles, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['URL', 'Content'])  # Write header
        for _ in range(num_articles):
            print(f"Scraping article - {start_url}")
            content = scrape_page(start_url)
            if content:
                csv_writer.writerow([start_url, content])
            # Get the URL of the next article
            next_article_link = get_next_article_link(start_url)
            if next_article_link:
                start_url = next_article_link
            else:
                print("No more articles to scrape.")
                break
def get_next_article_link(current_url):
    response = requests.get(current_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        older_link = soup.find('a', class_='blog-pager-older-link')
        if older_link and 'href' in older_link.attrs:
            return older_link['href']
    return None
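
# Optional sketch (not part of the original script): a shared session with a
# fixed pause between fetches is kinder to the server than back-to-back
# requests.get calls. The one-second delay and ten-second timeout below are
# assumed values; tune them to the site's rate limits before using this.
import time

session = requests.Session()

def polite_get(url, delay_seconds=1.0):
    """Fetch a URL after a fixed pause, reusing one connection via the session."""
    time.sleep(delay_seconds)  # assumed delay; adjust as needed
    return session.get(url, timeout=10)  # assumed timeout; adjust as needed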
if __name__ == "__main__":
    start_url = BASE_URL  # Replace with the actual starting URL
    num_articles_to_scrape = 1000  # Replace with the desired number of articles to scrape
    output_csv_file = 'scraped_data.csv'  # Replace with the desired output CSV file name
    scrape_and_save_multiple_articles(start_url, num_articles_to_scrape, output_csv_file)