81 lines
2.3 KiB
Python
81 lines
2.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation, either version 3 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful, but
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
# General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program. If not, see
|
|
# <http://www.gnu.org/licenses/>.
|
|
|
|
"""
|
|
Web link checker script by Gergely Polonkai.
|
|
"""
|
|
|
|
import sys
|
|
from bs4 import BeautifulSoup
|
|
import requests
|
|
import re
|
|
import urlparse
|
|
|
|
def main(*args):
    """Crawl a site from a start URL and record every internal link visited.

    The first positional argument is the start URL. Each queued link is
    fetched with ``requests.get``; when the response status is 200 the body
    is parsed and every ``<a href>`` target is examined. Targets sharing the
    start URL's scheme and network location are queued for checking, all
    others are reported as external and skipped. When the queue is empty the
    checked links are printed and written, one per line, to
    ``link_list.txt`` in the current working directory.

    Returns 1 when no URL argument is given, 0 otherwise.
    """
    from collections import deque

    if len(args) < 1:
        print("Usage: %s <url>" % sys.argv[0])
        return 1

    base_url = args[0]
    base_parts = urlparse.urlparse(base_url)

    # Links already fetched, in visiting order (this is what gets written
    # out), plus a set mirror for constant-time membership tests.
    checked_links = []
    checked = set()

    # Links still waiting to be fetched. `queued` holds every link that was
    # ever put on `pending`, so a link is never queued twice.
    pending = deque([base_url])
    queued = set(pending)

    while pending:
        page_url = pending.popleft()
        checked_links.append(page_url)
        checked.add(page_url)

        print("Checking %s" % page_url)

        try:
            response = requests.get(page_url)
        except requests.RequestException as error:
            # One unreachable page must not abort the crawl and lose the
            # results collected so far.
            print("Failed to fetch %s: %s" % (page_url, error))
            continue

        if response.status_code != 200:
            print(response.status_code)
            continue

        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(response.content, 'html.parser')

        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href:
                # <a> without a target (e.g. a named anchor): nothing to check.
                continue

            # Relative links are relative to the page they appear on, not to
            # the URL the crawl started from.
            target = urlparse.urljoin(page_url, href)
            target_parts = urlparse.urlparse(target)

            internal = (base_parts.scheme == target_parts.scheme
                        and base_parts.netloc == target_parts.netloc)

            if not internal:
                print("Skipping external link %s" % target)
            elif target in checked:
                print("Skipping checked link %s" % target)
            elif target not in queued:
                queued.add(target)
                pending.append(target)

    print("Done. Checked links:")

    with open('link_list.txt', 'w') as f:
        for link in checked_links:
            f.write(link + "\n")
            print(link)

    return 0
|
|
|
|
if __name__ == '__main__':
    # Hand main()'s return value to the shell so a usage error (1) is visible
    # as a non-zero exit status; a None return still exits with status 0.
    sys.exit(main(*sys.argv[1:]))
|