from bs4 import BeautifulSoup
import json
from time import sleep
import time
import requests
from random import randint
from html.parser import HTMLParser
USER_AGENT = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
class SearchEngine:
    """Scrape the first page of Ask.com web results for a query."""

    @staticmethod
    def search(query, sleep=True):
        """Fetch the Ask.com result page for ``query`` and return its links.

        Args:
            query: Search text. Runs of whitespace are collapsed to single
                spaces before the request is made.
            sleep: When true, pause for a random 10-100 seconds before the
                request. The parameter name shadows the ``sleep`` function
                imported at file level, which is why the delay is issued
                through the ``time`` module.

        Returns:
            The list produced by ``scrape_search_result`` for the fetched
            page.

        Raises:
            requests.RequestException: Propagated from ``requests.get``,
                including a timeout when the server does not answer within
                30 seconds.
        """
        if sleep:
            time.sleep(randint(10, 100))
        terms = ' '.join(query.split())
        # Passing the query through ``params`` lets requests URL-encode it,
        # so characters such as '&', '#' or '+' cannot corrupt the URL.
        # The timeout keeps one stalled connection from hanging a long run.
        response = requests.get(
            'http://www.ask.com/web',
            params={'q': terms},
            headers=USER_AGENT,
            timeout=30,
        )
        soup = BeautifulSoup(response.text, "html.parser")
        return SearchEngine.scrape_search_result(soup)

    @staticmethod
    def scrape_search_result(soup, max_results=10):
        """Collect unique result links from a parsed result page.

        Links are taken from ``<a>`` tags inside ``<div>`` elements whose
        class is ``PartialSearchResults-item-title``.

        Args:
            soup: Parsed page; must offer ``find_all`` the way a
                BeautifulSoup document does.
            max_results: Upper bound on the number of links returned.

        Returns:
            Up to ``max_results`` distinct ``href`` values, in the order in
            which they appear on the page. Anchors with a missing or empty
            ``href`` are skipped.
        """
        seen = set()
        ordered = []
        titles = soup.find_all(
            "div", attrs={"class": "PartialSearchResults-item-title"}
        )
        for title in titles:
            for anchor in title.find_all("a"):
                if len(ordered) >= max_results:
                    return ordered
                href = anchor.attrs.get("href")
                if not href or href in seen:
                    continue
                # A separate set gives O(1) duplicate checks while the list
                # preserves the page's ranking order.
                seen.add(href)
                ordered.append(href)
        return ordered
if __name__ == '__main__':
    # Run every query listed in the input file through SearchEngine.search
    # and save the query -> result-list mapping as JSON.
    with open("100QueriesSet3.txt", "r") as f:
        # Drop the line terminator so it does not end up in the JSON keys;
        # the context manager closes the file before the slow searches start.
        queries = [line.rstrip("\r\n") for line in f]

    dictionary = {}
    for key in queries:
        if not key.strip():
            # Blank lines carry no query; skipping them avoids a pointless
            # delayed request and an empty key in the output.
            continue
        dictionary[key] = SearchEngine.search(key)

    with open("hw1.json", "w") as outfile:
        json.dump(dictionary, outfile, indent=4)
{"html5":"htmlmixed","css":"css","javascript":"javascript","php":"php","python":"python","ruby":"ruby","lua":"text\/x-lua","bash":"text\/x-sh","go":"go","c":"text\/x-csrc","cpp":"text\/x-c++src","diff":"diff","latex":"stex","sql":"sql","xml":"xml","apl":"apl","asterisk":"asterisk","c_loadrunner":"text\/x-csrc","c_mac":"text\/x-csrc","coffeescript":"text\/x-coffeescript","csharp":"text\/x-csharp","d":"d","ecmascript":"javascript","erlang":"erlang","groovy":"text\/x-groovy","haskell":"text\/x-haskell","haxe":"text\/x-haxe","html4strict":"htmlmixed","java":"text\/x-java","java5":"text\/x-java","jquery":"javascript","mirc":"mirc","mysql":"sql","ocaml":"text\/x-ocaml","pascal":"text\/x-pascal","perl":"perl","perl6":"perl","plsql":"sql","properties":"text\/x-properties","q":"text\/x-q","scala":"scala","scheme":"text\/x-scheme","tcl":"text\/x-tcl","vb":"text\/x-vb","verilog":"text\/x-verilog","yaml":"text\/x-yaml","z80":"text\/x-z80"}