from bs4 import BeautifulSoup
import json
from time import sleep  # NOTE(review): unused (time.sleep is used); kept to avoid breaking unseen callers
import time
import requests
from random import randint
from html.parser import HTMLParser  # NOTE(review): unused here; kept to avoid breaking unseen callers

# Browser-like User-Agent so ask.com serves the normal HTML results page
# instead of a bot-detection / stripped-down response.
USER_AGENT = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/61.0.3163.100 Safari/537.36'
}


class SearchEngine:
    """Scrapes the top result links for a query from ask.com web search."""

    @staticmethod
    def search(query, sleep=True):
        """Fetch the ask.com result page for *query* and return up to 10 URLs.

        Parameters
        ----------
        query : str
            Free-text search query; whitespace is collapsed into '+'.
        sleep : bool
            When true, wait a random 10-100 seconds before the request
            as crude rate limiting.  (NOTE: this parameter shadows the
            module-level ``from time import sleep``; the name is kept
            for backward compatibility with keyword callers.)

        Returns
        -------
        list[str]
            Up to 10 result hrefs in page (ranking) order.
        """
        if sleep:
            time.sleep(randint(10, 100))
        url = 'http://www.ask.com/web?q=' + '+'.join(query.split())
        soup = BeautifulSoup(requests.get(url, headers=USER_AGENT).text,
                             "html.parser")
        return SearchEngine.scrape_search_result(soup)

    @staticmethod
    def scrape_search_result(soup):
        """Extract up to 10 unique result links from a parsed result page.

        BUG FIX: the original collected hrefs in a ``set``, which discards
        the search ranking order and makes the 10-link cutoff arbitrary
        (which links survive depended on hash order).  An ordered list
        plus a seen-set keeps the *top* 10 in rank order and still
        deduplicates.
        """
        raw_results = soup.find_all(
            "div", attrs={"class": "PartialSearchResults-item-title"})
        results = []
        seen = set()
        for result in raw_results:
            for anchor in result.find_all("a"):
                # .get() instead of ["href"]: anchors without an href
                # would otherwise raise KeyError.
                href = anchor.attrs.get("href")
                if href and href not in seen:
                    seen.add(href)
                    results.append(href)
                    if len(results) >= 10:
                        return results
        return results


if __name__ == '__main__':
    # Read one query per line, scrape each, and dump {query: [urls]} as JSON.
    # FIX: use a context manager (the original leaked the file handle) and
    # strip each line so JSON keys don't carry a trailing newline; blank
    # lines are skipped instead of being sent as empty queries.
    with open("100QueriesSet3.txt", "r") as f:
        lines = f.readlines()
    dictionary = {}
    for line in lines:
        key = line.strip()
        if not key:
            continue
        dictionary[key] = SearchEngine.search(key)
    with open("hw1.json", "w") as outfile:
        json.dump(dictionary, outfile, indent=4)