Make internal link checking more sophisticated
This commit is contained in:
parent 84d12fd501
commit 3c61ed87b7
@@ -36,6 +36,7 @@ def main(*args):
|
|||||||
return 1
|
return 1
|
||||||
|
|
||||||
base_url = args[0]
|
base_url = args[0]
|
||||||
|
base_parts = urlparse.urlparse(base_url)
|
||||||
link_cache.append(base_url)
|
link_cache.append(base_url)
|
||||||
|
|
||||||
while len(link_cache) > 0:
|
while len(link_cache) > 0:
|
||||||
@@ -53,7 +54,12 @@ def main(*args):
|
|||||||
for a in soup.find_all('a'):
|
for a in soup.find_all('a'):
|
||||||
link = urlparse.urljoin(base_url, a.get('href'))
|
link = urlparse.urljoin(base_url, a.get('href'))
|
||||||
|
|
||||||
if link.startswith(base_url):
|
link_parts = urlparse.urlparse(link)
|
||||||
|
|
||||||
|
internal = base_parts.scheme == link_parts.scheme \
|
||||||
|
and base_parts.netloc == link_parts.netloc
|
||||||
|
|
||||||
|
if internal:
|
||||||
if link not in checked_links:
|
if link not in checked_links:
|
||||||
link_cache.append(link)
|
link_cache.append(link)
|
||||||
else:
|
else:
|
||||||
|
Loading…
Reference in New Issue
Block a user