Initial version

This commit is contained in:
Gergely Polonkai 2015-06-07 17:03:54 +02:00
commit c8dd28d87f

61
crawler.py Normal file
View File

@ -0,0 +1,61 @@
import urllib2
import sys
from bs4 import BeautifulSoup
import requests
import re
def make_url(link, base_url):
if re.search(r'^[a-z-]+:', link):
return link
return re.sub(r'([^:])//+', r'\1/', base_url + '/' + link)
def main():
checked_links = []
link_cache = []
if len(sys.argv) < 2:
print("Usage: %s <url>" % sys.argv[0])
return 1
base_url = sys.argv[1]
link_cache.append(base_url)
while True:
link = link_cache[0]
link_cache = [x for x in link_cache if x != link]
checked_links.append(link)
print("Checking %s" % link)
r = requests.get(link)
if r.status_code == 200:
soup = BeautifulSoup(r.content)
for a in soup.find_all('a'):
link = make_url(a.get('href'), base_url)
if link.startswith(base_url):
if link not in checked_links:
link_cache.append(link)
else:
print("Skipping checked link %s" % link)
else:
print("Skipping external link %s" % link)
else:
print r.status_code
if len(link_cache) == 0:
break
print("Done. Checked links:")
with open('link_list.txt', 'w') as f:
for link in checked_links:
f.write(link + "\n")
print link
if __name__ == '__main__':
main()