import requests
from bs4 import BeautifulSoup
import re
import csv

BASE_URL = 'https://www.thedailysentry.net/2023/10/yexel-sebastian-nagsalita-investment.html'  # Replace with the actual base URL


def scrape_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        article = soup.find('article')

        # Specify the elements to remove
        elements_to_remove = article.find_all(['span', 'td', 'div'], style=['font-size: x-small;', 'text-align: center;'])
        elements_to_remove += article.find_all('div', class_='separator', style='clear: both; text-align: center;')
        for element in elements_to_remove:
            element.extract()

        # Extract text content and add a space after periods, question marks, and exclamation points
        modified_content = re.sub(r'(?<=[.!?])', ' ', article.get_text(strip=True))

        # Remove text starting with "source: " (pattern assumed; the original statement was truncated)
        modified_content = re.sub(r'source: .*', '', modified_content, flags=re.IGNORECASE)

        # Remove "***" (assumed simple replacement; the original statement was truncated)
        modified_content = modified_content.replace('***', '')

        return modified_content.strip()  # Strip leading and trailing spaces
    else:
        print(f"Failed to retrieve the page. Status code: {response.status_code}")
        return None


def scrape_and_save_multiple_articles(start_url, num_articles, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['URL', 'Content'])  # Write header

        for _ in range(num_articles):
            print(f"Scraping article - {start_url}")
            content = scrape_page(start_url)
            if content:
                csv_writer.writerow([start_url, content])

            # Get the URL of the next article
            next_article_link = get_next_article_link(start_url)
            if next_article_link:
                start_url = next_article_link
            else:
                print("No more articles to scrape.")
                break


def get_next_article_link(current_url):
    response = requests.get(current_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        older_link = soup.find('a', class_='blog-pager-older-link')
        if older_link and 'href' in older_link.attrs:
            return older_link['href']
    return None


if __name__ == "__main__":
    start_url = BASE_URL  # Replace with the actual starting URL
    num_articles_to_scrape = 1000  # Replace with the desired number of articles to scrape
    output_csv_file = 'scraped_data.csv'  # Replace with the desired output CSV file name
    scrape_and_save_multiple_articles(start_url, num_articles_to_scrape, output_csv_file)