From 3c61ed87b711828d61a800068bbeb8ad852dcdff Mon Sep 17 00:00:00 2001
From: Gergely Polonkai
Date: Mon, 31 Aug 2015 15:09:04 +0000
Subject: [PATCH] Make internal link checking more sophisticated

---
 crawler.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/crawler.py b/crawler.py
index 2aade88..269b8af 100644
--- a/crawler.py
+++ b/crawler.py
@@ -36,6 +36,7 @@ def main(*args):
         return 1
 
     base_url = args[0]
+    base_parts = urlparse.urlparse(base_url)
     link_cache.append(base_url)
 
     while len(link_cache) > 0:
@@ -53,7 +54,12 @@ def main(*args):
         for a in soup.find_all('a'):
             link = urlparse.urljoin(base_url, a.get('href'))
 
-            if link.startswith(base_url):
+            link_parts = urlparse.urlparse(link)
+
+            internal = base_parts.scheme == link_parts.scheme \
+                and base_parts.netloc == link_parts.netloc
+
+            if internal:
                 if link not in checked_links:
                     link_cache.append(link)
             else:
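
Below is a minimal standalone sketch of the internal-link check this patch introduces. It assumes Python 3's urllib.parse (the patch itself uses the Python 2 urlparse module, which provides the same urlparse() and urljoin() functions), and the is_internal() helper is illustrative, not part of crawler.py.

# Standalone sketch of the scheme/netloc comparison from the patch,
# using Python 3's urllib.parse instead of the Python 2 urlparse module.
from urllib.parse import urljoin, urlparse

def is_internal(base_url, href):
    """Return True if href resolves to the same scheme and host as base_url."""
    base_parts = urlparse(base_url)
    # Resolve relative hrefs against the base URL first, as the crawler does.
    link_parts = urlparse(urljoin(base_url, href))
    return (base_parts.scheme == link_parts.scheme
            and base_parts.netloc == link_parts.netloc)

if __name__ == '__main__':
    base = 'http://example.com/docs/'
    print(is_internal(base, 'page.html'))                # True: relative link, same host
    print(is_internal(base, 'http://example.com/about')) # True: same host, different path
    print(is_internal(base, 'http://other.org/a'))       # False: different host

Compared with the previous startswith(base_url) test, comparing scheme and netloc treats any page on the starting host as internal regardless of path (e.g. http://example.com/about when crawling from http://example.com/docs/), and it no longer accepts hosts that merely share a string prefix with the base URL (e.g. http://example.community when the base is http://example.com).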