120 lines
3.7 KiB
Python
120 lines
3.7 KiB
Python
# -*- coding: utf-8 -*-
|
||
#
|
||
# This program is free software: you can redistribute it and/or modify
|
||
# it under the terms of the GNU General Public License as published by
|
||
# the Free Software Foundation, either version 3 of the License, or
|
||
# (at your option) any later version.
|
||
#
|
||
# This program is distributed in the hope that it will be useful, but
|
||
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
# General Public License for more details.
|
||
#
|
||
# You should have received a copy of the GNU General Public License
|
||
# along with this program. If not, see
|
||
# <http://www.gnu.org/licenses/>.
|
||
|
||
"""
|
||
Web link checker script by Gergely Polonkai.
|
||
"""
|
||
|
||
import sys
|
||
from bs4 import BeautifulSoup
|
||
import requests
|
||
import re
|
||
import urlparse
|
||
|
||
import pprint
|
||
|
||
# Module-level pretty-printer; main() uses it to dump the collected link table.
pp = pprint.PrettyPrinter(indent=4)
|
||
|
||
def _is_internal(base_parts, link_parts):
    """Tell whether a link points to the same site as the base URL.

    Both arguments are parsed-URL objects exposing ``scheme`` and ``netloc``
    attributes. A link is internal only when both attributes are equal to
    those of the base URL.
    """
    same_scheme = base_parts.scheme == link_parts.scheme

    # The host is only compared when the scheme already matches.
    return same_scheme and base_parts.netloc == link_parts.netloc
|
||
|
||
def _update_link(checked_links, base_parts, link, checked=False, initial=False):
    """Register one sighting of ``link`` in the ``checked_links`` table.

    ``checked_links`` maps a URL to a dict with the keys ``checked``,
    ``count`` and ``external``; it is modified in place.

    * Unknown link: a new entry is created. ``count`` starts at 0 when
      ``initial`` is true and at 1 otherwise; ``external`` is the negation of
      ``_is_internal(base_parts, <parsed link>)``.
    * Known link: ``checked`` is OR-ed with the ``checked`` argument and
      ``count`` is incremented unless ``initial`` is true.

    Returns the ``checked`` value the link had before this call (``False``
    for a link that was not in the table yet).
    """
    # Parsed before the lookup, so an unparsable URL fails regardless of
    # whether the link is already known.
    parsed_link = urlparse.urlparse(link)

    if link not in checked_links:
        checked_links[link] = {
            'checked': checked,
            'count': 0 if initial else 1,
            'external': not _is_internal(base_parts, parsed_link),
        }

        return False

    entry = checked_links[link]
    previously_checked = entry['checked']
    entry['checked'] = previously_checked or checked

    if not initial:
        entry['count'] += 1

    return previously_checked
|
||
|
||
def main(*args):
    """Crawl a site from a start URL and report the state of every link.

    ``args[0]`` is the start URL. Every queued link is requested once with
    redirects disabled; a redirect target is queued as a link of its own.
    Pages on the same scheme and host as the start URL are parsed and each
    ``<a href>`` found is queued; external pages are requested but not
    crawled. When no unchecked link is left, the table is pretty-printed.

    The table is keyed by URL. Each entry carries ``checked``, ``count`` and
    ``external`` (maintained by ``_update_link``) and ``uncheckable``; links
    that were actually requested also get ``broken`` and, when the request
    succeeded, ``redirect``.

    Returns 1 when no URL was passed (after printing a usage line),
    otherwise None.
    """
    # We will store the links already checked here
    checked_links = {}

    if len(args) < 1:
        print("Usage: %s <url>" % sys.argv[0])

        return 1

    base_url = args[0]
    base_parts = urlparse.urlparse(base_url)
    _update_link(checked_links, base_parts, base_url, initial=True)

    while True:
        # One scan per iteration: pick the first link not yet checked, or
        # stop when there is none left.
        current_link = next(
            (url for url in checked_links
             if not checked_links[url]['checked']),
            None)

        if current_link is None:
            break

        current_parts = urlparse.urlparse(current_link)

        print("Checking %s" % current_link)

        # NOTE(review): this call also bumps the entry's 'count' by one,
        # because initial is left False -- confirm whether that is intended.
        _update_link(checked_links, base_parts, current_link, checked=True)
        entry = checked_links[current_link]

        # Only HTTP(S) URLs can be requested; mailto:, javascript: and
        # friends are recorded but never fetched.
        entry['uncheckable'] = current_parts.scheme not in ('http', 'https')

        if entry['uncheckable']:
            continue

        try:
            response = requests.get(current_link, allow_redirects=False)
        except requests.exceptions.RequestException:
            # Connection/DNS/URL errors mean the link is broken; they must
            # not abort the whole crawl.
            entry['broken'] = True

            continue

        if not response.ok:
            entry['broken'] = True

            continue

        entry['broken'] = False

        # If we are being redirected, add the redirect target to
        # checked_links. It will be checked in a later iteration.
        if response.is_redirect:
            # The Location header may be relative; resolve it against the
            # URL that issued the redirect.
            target = urlparse.urljoin(current_link,
                                      response.headers['location'])
            _update_link(checked_links, base_parts, target, initial=True)
            entry['redirect'] = True

            continue

        entry['redirect'] = False

        # Don't crawl external pages
        if entry['external']:
            continue

        soup = BeautifulSoup(response.content)

        for a in soup.find_all('a'):
            href = a.get('href')

            # Anchors without an href (e.g. named anchors) link nowhere.
            if href is None:
                continue

            # Relative links are relative to the page they appear on, not
            # to the start URL.
            link = urlparse.urljoin(current_link, href)

            _update_link(checked_links, base_parts, link)

    pp.pprint(checked_links)
|
||
|
||
if __name__ == '__main__':
    # Pass main()'s return value to the shell so the usage error (1) becomes
    # a non-zero exit status; a None return maps to exit status 0.
    sys.exit(main(*sys.argv[1:]))
|