commit c8dd28d87f2e1c971d8841c3c77a8571d5814365
Author: Gergely Polonkai
Date:   Sun Jun 7 17:03:54 2015 +0200

    Initial version

diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..7968837
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,61 @@
+import urllib2
+import sys
+from bs4 import BeautifulSoup
+import requests
+import re
+
+def make_url(link, base_url):
+    if re.search(r'^[a-z-]+:', link):
+        return link
+
+    return re.sub(r'([^:])//+', r'\1/', base_url + '/' + link)
+
+def main():
+    checked_links = []
+    link_cache = []
+
+    if len(sys.argv) < 2:
+        print("Usage: %s <base URL>" % sys.argv[0])
+        return 1
+
+    base_url = sys.argv[1]
+
+    link_cache.append(base_url)
+
+    while True:
+        link = link_cache[0]
+        link_cache = [x for x in link_cache if x != link]
+        checked_links.append(link)
+
+        print("Checking %s" % link)
+
+        r = requests.get(link)
+
+        if r.status_code == 200:
+            soup = BeautifulSoup(r.content)
+
+            for a in soup.find_all('a'):
+                link = make_url(a.get('href'), base_url)
+
+                if link.startswith(base_url):
+                    if link not in checked_links:
+                        link_cache.append(link)
+                    else:
+                        print("Skipping checked link %s" % link)
+                else:
+                    print("Skipping external link %s" % link)
+        else:
+            print r.status_code
+
+        if len(link_cache) == 0:
+            break
+
+    print("Done. Checked links:")
+
+    with open('link_list.txt', 'w') as f:
+        for link in checked_links:
+            f.write(link + "\n")
+            print link
+
+if __name__ == '__main__':
+    main()
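
For illustration, here is a standalone sketch of what make_url from this commit does: a link that already carries a scheme (such as "http:" or "mailto:") passes through unchanged, while everything else is joined onto the base URL with duplicate slashes collapsed. The example URLs below are made up, not taken from the commit.

    import re

    def make_url(link, base_url):
        # Links that already have a scheme ("http:", "mailto:", ...) are kept as-is.
        if re.search(r'^[a-z-]+:', link):
            return link

        # Join the link onto the base URL, then collapse runs of slashes;
        # the ([^:]) group keeps the "//" after the scheme intact.
        return re.sub(r'([^:])//+', r'\1/', base_url + '/' + link)

    # Made-up example values:
    print(make_url('/about/', 'http://example.com'))
    # -> http://example.com/about/
    print(make_url('mailto:someone@example.com', 'http://example.com'))
    # -> mailto:someone@example.com (returned unchanged)

Note that this joining is purely textual: unlike urlparse.urljoin, it always resolves relative to the base URL given on the command line, not relative to the page the link was found on.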