Facebook
From Funky Cassowary, 1 Year ago, written in Python.
Embed
Download Paste or View Raw
Hits: 148
  1. # -*- coding: utf-8 -*-
  2. import argparse
  3. import requests
  4. import re
  5. from bs4 import BeautifulSoup
  6. import sys
  7. reload(sys)
  8. sys.setdefaultencoding('utf-8')
  9.  
  10. parser = argparse.ArgumentParser()
  11. parser.add_argument('--url')
  12. parser.add_argument('-c', type=int)
  13. parser.add_argument('--query')
  14. args = parser.parse_args()
  15. total = 0
  16.  
  17.  
  18. def get_title(url):
  19.     content = requests.request("GET", url).content
  20.     title = BeautifulSoup(content, "html.parser").head.title.string
  21.     print "Title to match: {}\n".format(title)
  22.     return title
  23.  
  24.  
  25. def get_links(url):
  26.     content = requests.request("GET", url).content
  27.     links = BeautifulSoup(content, "html.parser").find("div", class_='mw-parser-output')
  28.     try:
  29.         links = links.find_all('a', href=True)
  30.     except Exception:
  31.         print "Exception with {}".format(url)
  32.     links = [i['href'] for i in links]
  33.     regex = re.compile("^\/wiki\/(?!Plik|Szablon|Pomoc|Portal).*(?<!jpg)$")
  34.     links = filter(regex.match, links)
  35.     return links
  36.  
  37.  
  38. def check_for_query(url, query):
  39.     if query in get_title(url):
  40.         return True
  41.     else:
  42.         return False
  43.  
  44.  
  45. def process_url(url, print_url, query, counter):
  46.     counter = counter + 1
  47.     if counter > args.c:
  48.         print print_url + " -> " + url + "This found nothing and died on counter {}\n".format(counter)
  49.         return
  50.     else:
  51.         print_url = print_url + " -> " + url
  52.         if check_for_query(url, query) is True:
  53.             print "FOUND!!!  " + print_url
  54.             return True
  55.         else:
  56.             for i in get_links(url):
  57.                 target_url = "https://pl.wikipedia.org" + i
  58.                 print "processing {} with counter {}\n".format(target_url, counter)
  59.                 global total
  60.                 total = total + 1
  61.                 if process_url(target_url, print_url, query, counter):
  62.                     return
  63.                     sys.exit(0)
  64.  
  65.  
  66.  
  67. if __name__ == "__main__":
  68.     print get_links(args.url)
  69.     # for i in get_links(args.url):
  70.     #     print i
  71.     process_url(args.url, args.url, args.query, 0)
  72.     print "Total operations done: {}".format(total)
  73.     # process_url(args.url, args.url, args.query, 0)
  74.