import requests
from bs4 import BeautifulSoup
import re
import csv

BASE_URL = 'https://www.thedailysentry.net/2023/10/yexel-sebastian-nagsalita-investment.html'  # Replace with the actual base URL


def scrape_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        article = soup.find('article')

        # Specify the elements to remove
        elements_to_remove = article.find_all(['span', 'td', 'div'], style=['font-size: x-small;', 'text-align: center;'])
        elements_to_remove += article.find_all('div', class_='separator', style='clear: both; text-align: center;')
        for element in elements_to_remove:
            element.extract()

        # Extract text content and add a space after periods, question marks, and exclamation points
        modified_content = re.sub(r'(?<=[.!?])', ' ', article.get_text(strip=True))

        # Remove text starting with "source: " (pattern assumed; the original statement was truncated)
        modified_content = re.sub(r'source: .*', '', modified_content, flags=re.IGNORECASE)

        # Remove "***" (assumed simple replacement; the original statement was truncated)
        modified_content = modified_content.replace('***', '')

        return modified_content.strip()  # Strip leading and trailing spaces
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None


def scrape_and_save_multiple_articles(start_url, num_articles, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['URL', 'Content'])  # Write header

        for _ in range(num_articles):
            print(f"Scraping article - {start_url}")
            content = scrape_page(start_url)
            if content:
                csv_writer.writerow([start_url, content])

            # Get the URL of the next article
            next_article_link = get_next_article_link(start_url)
            if next_article_link:
                start_url = next_article_link
            else:
                print("No more articles to scrape.")
                break


def get_next_article_link(current_url):
    response = requests.get(current_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        older_link = soup.find('a', class_='blog-pager-older-link')
        if older_link and 'href' in older_link.attrs:
            return older_link['href']
    return None


if __name__ == "__main__":
    start_url = BASE_URL  # Replace with the actual starting URL
    num_articles_to_scrape = 1000  # Replace with the desired number of articles to scrape
    output_csv_file = 'scraped_data.csv'  # Replace with the desired output CSV file name
    scrape_and_save_multiple_articles(start_url, num_articles_to_scrape, output_csv_file)